diff --git a/.github/workflows/develop.yml b/.github/workflows/develop.yml index 6f9acb4cb0..36aedc0d60 100644 --- a/.github/workflows/develop.yml +++ b/.github/workflows/develop.yml @@ -5,13 +5,13 @@ on: branches: - develop paths: - - '*.bazel' - - '.bazelrc' - - '.github/workflows/**.yml' - - 'PACE' - - 'VERSION' - - 'bazel/**' + - 'build.zig' + - 'build.zig.zon' + - 'ext/**' + - '!ext/**.md' - 'pkg/**' + - '.github/workflows/**.yml' + - '*.sh' jobs: urbit: diff --git a/.github/workflows/feature.yml b/.github/workflows/feature.yml index 60444b89e3..d78b911e13 100644 --- a/.github/workflows/feature.yml +++ b/.github/workflows/feature.yml @@ -3,12 +3,13 @@ name: Feature pull request on: pull_request: paths: - - '.bazelrc' - - '.github/workflows/*.yml' - - 'BUILD.bazel' - - 'WORKSPACE.bazel' - - 'bazel/**' + - 'build.zig' + - 'build.zig.zon' + - 'ext/**' + - '!ext/**.md' - 'pkg/**' + - '.github/workflows/**.yml' + - '*.sh' jobs: urbit: diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 368c417b4f..02a2b84c58 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -5,14 +5,13 @@ on: branches: - master paths: - - '*.bazel' - - '.bazelrc' - - '.github/workflows/**.yml' - - 'PACE' - - 'VERSION' - - 'bazel/**' + - 'build.zig' + - 'build.zig.zon' + - 'ext/**' + - '!ext/**.md' - 'pkg/**' - + - '.github/workflows/**.yml' + - '*.sh' jobs: urbit: diff --git a/.github/workflows/next.yml b/.github/workflows/next.yml index 07250449e3..71738ad940 100644 --- a/.github/workflows/next.yml +++ b/.github/workflows/next.yml @@ -5,13 +5,13 @@ on: branches: - 'next/kelvin/*' paths: - - '*.bazel' - - '.bazelrc' - - '.github/workflows/**.yml' - - 'PACE' - - 'VERSION' - - 'bazel/**' + - 'build.zig' + - 'build.zig.zon' + - 'ext/**' + - '!ext/**.md' - 'pkg/**' + - '.github/workflows/**.yml' + - '*.sh' jobs: urbit: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 904884d5c1..ca4a907b07 100644 --- 
a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -5,13 +5,13 @@ on: branches: - release paths: - - '*.bazel' - - '.bazelrc' - - '.github/workflows/**.yml' - - 'PACE' - - 'VERSION' - - 'bazel/**' + - 'build.zig' + - 'build.zig.zon' + - 'ext/**' + - '!ext/**.md' - 'pkg/**' + - '.github/workflows/**.yml' + - '*.sh' jobs: urbit: diff --git a/.github/workflows/shared.yml b/.github/workflows/shared.yml index d9d503f82d..14af8e16d3 100644 --- a/.github/workflows/shared.yml +++ b/.github/workflows/shared.yml @@ -90,7 +90,6 @@ jobs: -Dpace=${{inputs.pace}} \ --summary all fi - - name: Run unit tests run: | zig build \ diff --git a/MODULE.bazel b/MODULE.bazel new file mode 100644 index 0000000000..00bb18361f --- /dev/null +++ b/MODULE.bazel @@ -0,0 +1,6 @@ +############################################################################### +# Bazel now uses Bzlmod by default to manage external dependencies. +# Please consider migrating your external dependencies from WORKSPACE to MODULE.bazel. 
+# +# For more details, please check https://github.com/bazelbuild/bazel/issues/18958 +############################################################################### diff --git a/bazel/third_party/softblas/BUILD.bazel b/bazel/third_party/softblas/BUILD.bazel new file mode 100644 index 0000000000..e69de29bb2 diff --git a/bazel/third_party/softblas/softblas.BUILD b/bazel/third_party/softblas/softblas.BUILD new file mode 100644 index 0000000000..34c80c93e6 --- /dev/null +++ b/bazel/third_party/softblas/softblas.BUILD @@ -0,0 +1,117 @@ +# FILEPATH: /home/neal/lagoon/vere/bazel/third_party/softblas/softblas.BUILD + +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") + +cc_library( + name = "softblas", + visibility = ["//visibility:public"], + deps = select({ + "@platforms//cpu:aarch64": [":softblas_aarch64"], + "@platforms//cpu:x86_64": [":softblas_x86_64"], + "//conditions:default": [], + }), +) + +cc_library( + name = "softblas_aarch64", + visibility = ["//visibility:public"], + hdrs = ["include/softblas.h"], + includes = ["include"], + srcs = [ + "include/softblas.h", + "src/softblas_state.c", + "src/blas/level1/sasum.c", + "src/blas/level1/dasum.c", + "src/blas/level1/hasum.c", + "src/blas/level1/qasum.c", + "src/blas/level1/saxpy.c", + "src/blas/level1/daxpy.c", + "src/blas/level1/haxpy.c", + "src/blas/level1/qaxpy.c", + "src/blas/level1/scopy.c", + "src/blas/level1/dcopy.c", + "src/blas/level1/hcopy.c", + "src/blas/level1/qcopy.c", + "src/blas/level1/sdot.c", + "src/blas/level1/ddot.c", + "src/blas/level1/hdot.c", + "src/blas/level1/qdot.c", + "src/blas/level1/snrm2.c", + "src/blas/level1/dnrm2.c", + "src/blas/level1/hnrm2.c", + "src/blas/level1/qnrm2.c", + "src/blas/level1/sscal.c", + "src/blas/level1/dscal.c", + "src/blas/level1/hscal.c", + "src/blas/level1/qscal.c", + "src/blas/level1/sswap.c", + "src/blas/level1/dswap.c", + "src/blas/level1/hswap.c", + "src/blas/level1/qswap.c", + "src/blas/level1/isamax.c", + "src/blas/level1/idamax.c", + 
"src/blas/level1/ihamax.c", + "src/blas/level1/iqamax.c", + "src/blas/level2/sgemv.c", + "src/blas/level2/dgemv.c", + "src/blas/level2/hgemv.c", + "src/blas/level2/qgemv.c", + "src/blas/level3/sgemm.c", + "src/blas/level3/dgemm.c", + "src/blas/level3/hgemm.c", + "src/blas/level3/qgemm.c" + ], + deps = ["@softfloat"], +) + +cc_library( + name = "softblas_x86_64", + visibility = ["//visibility:public"], + hdrs = ["include/softblas.h"], + includes = ["include"], + srcs = [ + "include/softblas.h", + "src/softblas_state.c", + "src/blas/level1/sasum.c", + "src/blas/level1/dasum.c", + "src/blas/level1/hasum.c", + "src/blas/level1/qasum.c", + "src/blas/level1/saxpy.c", + "src/blas/level1/daxpy.c", + "src/blas/level1/haxpy.c", + "src/blas/level1/qaxpy.c", + "src/blas/level1/scopy.c", + "src/blas/level1/dcopy.c", + "src/blas/level1/hcopy.c", + "src/blas/level1/qcopy.c", + "src/blas/level1/sdot.c", + "src/blas/level1/ddot.c", + "src/blas/level1/hdot.c", + "src/blas/level1/qdot.c", + "src/blas/level1/snrm2.c", + "src/blas/level1/dnrm2.c", + "src/blas/level1/hnrm2.c", + "src/blas/level1/qnrm2.c", + "src/blas/level1/sscal.c", + "src/blas/level1/dscal.c", + "src/blas/level1/hscal.c", + "src/blas/level1/qscal.c", + "src/blas/level1/sswap.c", + "src/blas/level1/dswap.c", + "src/blas/level1/hswap.c", + "src/blas/level1/qswap.c", + "src/blas/level1/isamax.c", + "src/blas/level1/idamax.c", + "src/blas/level1/ihamax.c", + "src/blas/level1/iqamax.c", + "src/blas/level2/sgemv.c", + "src/blas/level2/dgemv.c", + "src/blas/level2/hgemv.c", + "src/blas/level2/qgemv.c", + "src/blas/level3/sgemm.c", + "src/blas/level3/dgemm.c", + "src/blas/level3/hgemm.c", + "src/blas/level3/qgemm.c" + ], + deps = ["@softfloat"], +) diff --git a/build.zig b/build.zig index 2220aab9c9..287b1bcb38 100644 --- a/build.zig +++ b/build.zig @@ -283,6 +283,11 @@ fn build_single( .optimize = optimize, }); + const softblas = b.dependency("softblas", .{ + .target = target, + .optimize = optimize, + }); + const softfloat 
= b.dependency("softfloat", .{ .target = target, .optimize = optimize, @@ -485,13 +490,12 @@ fn build_single( pkg_noun.linkLibrary(pdjson.artifact("pdjson")); pkg_noun.linkLibrary(sigsegv.artifact("sigsegv")); pkg_noun.linkLibrary(softfloat.artifact("softfloat")); + pkg_noun.linkLibrary(softblas.artifact("softblas")); if (t.os.tag == .linux) pkg_noun.linkLibrary(unwind.artifact("unwind")); pkg_noun.linkLibrary(urcrypt.artifact("urcrypt")); pkg_noun.linkLibrary(whereami.artifact("whereami")); - if (t.os.tag == .linux) - pkg_noun.linkLibrary(zlib.artifact("z")); - + pkg_noun.linkLibrary(zlib.artifact("z")); pkg_noun.linkLibC(); pkg_noun.addIncludePath(b.path("pkg/noun")); @@ -689,6 +693,7 @@ fn build_single( "jets/f/ut_nest.c", "jets/f/ut_rest.c", "jets/g/plot.c", + "jets/i/lagoon.c", "jets/tree.c", "log.c", "manage.c", diff --git a/build.zig.zon b/build.zig.zon index f39c710e39..3f6a8c07b7 100644 --- a/build.zig.zon +++ b/build.zig.zon @@ -49,6 +49,9 @@ .softfloat = .{ .path = "./ext/softfloat", }, + .softblas = .{ + .path = "./ext/softblas", + }, .unwind = .{ .path = "./ext/unwind", }, diff --git a/ext/gmp/build.zig b/ext/gmp/build.zig index efa83a9895..576e18aa7a 100644 --- a/ext/gmp/build.zig +++ b/ext/gmp/build.zig @@ -18,328 +18,6 @@ pub fn build(b: *std.Build) void { lib.linkLibC(); - // TODO: The values here should be provided programmatically - const config_h = b.addConfigHeader(.{ - .style = .{ - .autoconf = dep_c.path("config.in"), - }, - .include_path = "config.h", - }, .{ - .GMP_MPARAM_H_SUGGEST = "./mpn/arm64/gmp-mparam.h", - .HAVE_ALARM = 1, - .HAVE_ALLOCA = 1, - .HAVE_ALLOCA_H = 1, - .HAVE_ATTRIBUTE_CONST = 1, - .HAVE_ATTRIBUTE_MALLOC = 1, - .HAVE_ATTRIBUTE_MODE = 1, - .HAVE_ATTRIBUTE_NORETURN = 1, - .HAVE_CLOCK = 1, - .HAVE_CLOCK_GETTIME = 1, - .HAVE_DECL_FGETC = 1, - .HAVE_DECL_FSCANF = 1, - .HAVE_DECL_OPTARG = 1, - .HAVE_DECL_SYS_ERRLIST = 1, - .HAVE_DECL_SYS_NERR = 1, - .HAVE_DECL_UNGETC = 1, - .HAVE_DECL_VFPRINTF = 1, - .HAVE_DLFCN_H = 1, - 
.HAVE_DOUBLE_IEEE_LITTLE_ENDIAN = 1, - .HAVE_FCNTL_H = 1, - .HAVE_FLOAT_H = 1, - .HAVE_GETPAGESIZE = 1, - .HAVE_GETRUSAGE = 1, - .HAVE_GETTIMEOFDAY = 1, - .HAVE_INTMAX_T = 1, - .HAVE_INTPTR_T = 1, - .HAVE_INTTYPES_H = 1, - .HAVE_LANGINFO_H = 1, - .HAVE_LIMB_LITTLE_ENDIAN = 1, - .HAVE_LOCALECONV = 1, - .HAVE_LOCALE_H = 1, - .HAVE_LONG_DOUBLE = 1, - .HAVE_LONG_LONG = 1, - .HAVE_MEMORY_H = 1, - .HAVE_MEMSET = 1, - .HAVE_MMAP = 1, - .HAVE_MPROTECT = 1, - .HAVE_NATIVE_mpn_add_n = 1, - .HAVE_NATIVE_mpn_add_nc = 1, - .HAVE_NATIVE_mpn_addlsh1_n = 1, - .HAVE_NATIVE_mpn_addlsh2_n = 1, - .HAVE_NATIVE_mpn_and_n = 1, - .HAVE_NATIVE_mpn_andn_n = 1, - .HAVE_NATIVE_mpn_bdiv_dbm1c = 1, - .HAVE_NATIVE_mpn_bdiv_q_1 = 1, - .HAVE_NATIVE_mpn_pi1_bdiv_q_1 = 1, - .HAVE_NATIVE_mpn_cnd_add_n = 1, - .HAVE_NATIVE_mpn_cnd_sub_n = 1, - .HAVE_NATIVE_mpn_com = 1, - .HAVE_NATIVE_mpn_copyd = 1, - .HAVE_NATIVE_mpn_copyi = 1, - .HAVE_NATIVE_mpn_gcd_11 = 1, - .HAVE_NATIVE_mpn_gcd_22 = 1, - .HAVE_NATIVE_mpn_hamdist = 1, - .HAVE_NATIVE_mpn_invert_limb = 1, - .HAVE_NATIVE_mpn_ior_n = 1, - .HAVE_NATIVE_mpn_iorn_n = 1, - .HAVE_NATIVE_mpn_lshift = 1, - .HAVE_NATIVE_mpn_lshiftc = 1, - .HAVE_NATIVE_mpn_mod_34lsub1 = 1, - .HAVE_NATIVE_mpn_mul_1 = 1, - .HAVE_NATIVE_mpn_mul_1c = 1, - .HAVE_NATIVE_mpn_nand_n = 1, - .HAVE_NATIVE_mpn_nior_n = 1, - .HAVE_NATIVE_mpn_popcount = 1, - .HAVE_NATIVE_mpn_rsblsh1_n = 1, - .HAVE_NATIVE_mpn_rsblsh2_n = 1, - .HAVE_NATIVE_mpn_rsh1add_n = 1, - .HAVE_NATIVE_mpn_rsh1sub_n = 1, - .HAVE_NATIVE_mpn_rshift = 1, - .HAVE_NATIVE_mpn_sqr_diag_addlsh1 = 1, - .HAVE_NATIVE_mpn_sub_n = 1, - .HAVE_NATIVE_mpn_sub_nc = 1, - .HAVE_NATIVE_mpn_sublsh1_n = 1, - .HAVE_NATIVE_mpn_sublsh2_n = 1, - .HAVE_NATIVE_mpn_xor_n = 1, - .HAVE_NATIVE_mpn_xnor_n = 1, - .HAVE_NL_LANGINFO = 1, - .HAVE_NL_TYPES_H = 1, - .HAVE_POPEN = 1, - .HAVE_PROCESSOR_INFO = 1, - .HAVE_PTRDIFF_T = 1, - .HAVE_QUAD_T = 1, - .HAVE_RAISE = 1, - .HAVE_SIGACTION = 1, - .HAVE_SIGALTSTACK = 1, - .HAVE_STACK_T = 1, - .HAVE_STDINT_H = 1, - 
.HAVE_STDLIB_H = 1, - .HAVE_STRCHR = 1, - .HAVE_STRERROR = 1, - .HAVE_STRINGS_H = 1, - .HAVE_STRING_H = 1, - .HAVE_STRNLEN = 1, - .HAVE_STRTOL = 1, - .HAVE_STRTOUL = 1, - .HAVE_SYSCONF = 1, - .HAVE_SYSCTL = 1, - .HAVE_SYSCTLBYNAME = 1, - .HAVE_SYS_MMAN_H = 1, - .HAVE_SYS_PARAM_H = 1, - .HAVE_SYS_RESOURCE_H = 1, - .HAVE_SYS_STAT_H = 1, - .HAVE_SYS_SYSCTL_H = 1, - .HAVE_SYS_TIMES_H = 1, - .HAVE_SYS_TIME_H = 1, - .HAVE_SYS_TYPES_H = 1, - .HAVE_TIMES = 1, - .HAVE_UINT_LEAST32_T = 1, - .HAVE_UNISTD_H = 1, - .HAVE_VSNPRINTF = 1, - .LSYM_PREFIX = "L", - .LT_OBJDIR = ".libs/", - .PACKAGE = "gmp", - .PACKAGE_BUGREPORT = "gmp-bugs@gmplib.org, see https://gmplib.org/manual/Reporting-Bugs.html", - .PACKAGE_NAME = "GNU MP", - .PACKAGE_STRING = "GNU MP 6.2.1", - .PACKAGE_TARNAME = "gmp", - .PACKAGE_URL = "http://www.gnu.org/software/gmp/", - .PACKAGE_VERSION = "6.2.1", - .RETSIGTYPE = null, - .SIZEOF_MP_LIMB_T = 8, - .SIZEOF_UNSIGNED = 4, - .SIZEOF_UNSIGNED_LONG = 8, - .SIZEOF_UNSIGNED_SHORT = 2, - .SIZEOF_VOID_P = 8, - .STDC_HEADERS = 1, - .TIME_WITH_SYS_TIME = 1, - .TUNE_SQR_TOOM2_MAX = "SQR_TOOM2_MAX_GENERIC", - .VERSION = "6.2.1", - .WANT_FFT = 1, - .WANT_TMP_ALLOCA = 1, - .YYTEXT_POINTER = 1, - .restrict = .__restrict, - .AC_APPLE_UNIVERSAL_BUILD = null, - .HAVE_ATTR_GET = null, - .HAVE_CALLING_CONVENTIONS = null, - .HAVE_CPUTIME = null, - .HAVE_DOUBLE_IEEE_BIG_ENDIAN = null, - .HAVE_DOUBLE_IEEE_LITTLE_SWAPPED = null, - .HAVE_DOUBLE_VAX_D = null, - .HAVE_DOUBLE_VAX_G = null, - .HAVE_DOUBLE_CRAY_CFP = null, - .HAVE_GETSYSINFO = null, - .HAVE_HIDDEN_ALIAS = null, - .HAVE_HOST_CPU_FAMILY_alpha = null, - .HAVE_HOST_CPU_FAMILY_m68k = null, - .HAVE_HOST_CPU_FAMILY_power = null, - .HAVE_HOST_CPU_FAMILY_powerpc = null, - .HAVE_HOST_CPU_FAMILY_x86 = null, - .HAVE_HOST_CPU_FAMILY_x86_64 = null, - .HAVE_HOST_CPU_alphaev67 = null, - .HAVE_HOST_CPU_alphaev68 = null, - .HAVE_HOST_CPU_alphaev7 = null, - .HAVE_HOST_CPU_m68020 = null, - .HAVE_HOST_CPU_m68030 = null, - .HAVE_HOST_CPU_m68040 
= null, - .HAVE_HOST_CPU_m68060 = null, - .HAVE_HOST_CPU_m68360 = null, - .HAVE_HOST_CPU_powerpc604 = null, - .HAVE_HOST_CPU_powerpc604e = null, - .HAVE_HOST_CPU_powerpc750 = null, - .HAVE_HOST_CPU_powerpc7400 = null, - .HAVE_HOST_CPU_supersparc = null, - .HAVE_HOST_CPU_i386 = null, - .HAVE_HOST_CPU_i586 = null, - .HAVE_HOST_CPU_i686 = null, - .HAVE_HOST_CPU_pentium = null, - .HAVE_HOST_CPU_pentiummmx = null, - .HAVE_HOST_CPU_pentiumpro = null, - .HAVE_HOST_CPU_pentium2 = null, - .HAVE_HOST_CPU_pentium3 = null, - .HAVE_HOST_CPU_pentium4 = null, - .HAVE_HOST_CPU_core2 = null, - .HAVE_HOST_CPU_nehalem = null, - .HAVE_HOST_CPU_westmere = null, - .HAVE_HOST_CPU_sandybridge = null, - .HAVE_HOST_CPU_ivybridge = null, - .HAVE_HOST_CPU_haswell = null, - .HAVE_HOST_CPU_broadwell = null, - .HAVE_HOST_CPU_skylake = null, - .HAVE_HOST_CPU_silvermont = null, - .HAVE_HOST_CPU_goldmont = null, - .HAVE_HOST_CPU_k8 = null, - .HAVE_HOST_CPU_k10 = null, - .HAVE_HOST_CPU_bulldozer = null, - .HAVE_HOST_CPU_piledriver = null, - .HAVE_HOST_CPU_steamroller = null, - .HAVE_HOST_CPU_excavator = null, - .HAVE_HOST_CPU_zen = null, - .HAVE_HOST_CPU_bobcat = null, - .HAVE_HOST_CPU_jaguar = null, - .HAVE_HOST_CPU_s390_z900 = null, - .HAVE_HOST_CPU_s390_z990 = null, - .HAVE_HOST_CPU_s390_z9 = null, - .HAVE_HOST_CPU_s390_z10 = null, - .HAVE_HOST_CPU_s390_z196 = null, - .HAVE_HOST_CPU_s390_zarch = null, - .HAVE_INVENT_H = null, - .HAVE_LIMB_BIG_ENDIAN = null, - .HAVE_MACHINE_HAL_SYSINFO_H = null, - .HAVE_NATIVE_mpn_add_n_sub_n = null, - .HAVE_NATIVE_mpn_addaddmul_1msb0 = null, - .HAVE_NATIVE_mpn_addlsh_n = null, - .HAVE_NATIVE_mpn_addlsh1_nc = null, - .HAVE_NATIVE_mpn_addlsh2_nc = null, - .HAVE_NATIVE_mpn_addlsh_nc = null, - .HAVE_NATIVE_mpn_addlsh1_n_ip1 = null, - .HAVE_NATIVE_mpn_addlsh2_n_ip1 = null, - .HAVE_NATIVE_mpn_addlsh_n_ip1 = null, - .HAVE_NATIVE_mpn_addlsh1_nc_ip1 = null, - .HAVE_NATIVE_mpn_addlsh2_nc_ip1 = null, - .HAVE_NATIVE_mpn_addlsh_nc_ip1 = null, - .HAVE_NATIVE_mpn_addlsh1_n_ip2 
= null, - .HAVE_NATIVE_mpn_addlsh2_n_ip2 = null, - .HAVE_NATIVE_mpn_addlsh_n_ip2 = null, - .HAVE_NATIVE_mpn_addlsh1_nc_ip2 = null, - .HAVE_NATIVE_mpn_addlsh2_nc_ip2 = null, - .HAVE_NATIVE_mpn_addlsh_nc_ip2 = null, - .HAVE_NATIVE_mpn_addmul_1c = null, - .HAVE_NATIVE_mpn_addmul_2 = null, - .HAVE_NATIVE_mpn_addmul_3 = null, - .HAVE_NATIVE_mpn_addmul_4 = null, - .HAVE_NATIVE_mpn_addmul_5 = null, - .HAVE_NATIVE_mpn_addmul_6 = null, - .HAVE_NATIVE_mpn_addmul_7 = null, - .HAVE_NATIVE_mpn_addmul_8 = null, - .HAVE_NATIVE_mpn_addmul_2s = null, - .HAVE_NATIVE_mpn_div_qr_1n_pi1 = null, - .HAVE_NATIVE_mpn_div_qr_2 = null, - .HAVE_NATIVE_mpn_divexact_1 = null, - .HAVE_NATIVE_mpn_divexact_by3c = null, - .HAVE_NATIVE_mpn_divrem_1 = null, - .HAVE_NATIVE_mpn_divrem_1c = null, - .HAVE_NATIVE_mpn_divrem_2 = null, - .HAVE_NATIVE_mpn_gcd_1 = null, - .HAVE_NATIVE_mpn_lshsub_n = null, - .HAVE_NATIVE_mpn_mod_1 = null, - .HAVE_NATIVE_mpn_mod_1_1p = null, - .HAVE_NATIVE_mpn_mod_1c = null, - .HAVE_NATIVE_mpn_mod_1s_2p = null, - .HAVE_NATIVE_mpn_mod_1s_4p = null, - .HAVE_NATIVE_mpn_modexact_1_odd = null, - .HAVE_NATIVE_mpn_modexact_1c_odd = null, - .HAVE_NATIVE_mpn_mul_2 = null, - .HAVE_NATIVE_mpn_mul_3 = null, - .HAVE_NATIVE_mpn_mul_4 = null, - .HAVE_NATIVE_mpn_mul_5 = null, - .HAVE_NATIVE_mpn_mul_6 = null, - .HAVE_NATIVE_mpn_mul_basecase = null, - .HAVE_NATIVE_mpn_mullo_basecase = null, - .HAVE_NATIVE_mpn_preinv_divrem_1 = null, - .HAVE_NATIVE_mpn_preinv_mod_1 = null, - .HAVE_NATIVE_mpn_redc_1 = null, - .HAVE_NATIVE_mpn_redc_2 = null, - .HAVE_NATIVE_mpn_rsblsh_n = null, - .HAVE_NATIVE_mpn_rsblsh1_nc = null, - .HAVE_NATIVE_mpn_rsblsh2_nc = null, - .HAVE_NATIVE_mpn_rsblsh_nc = null, - .HAVE_NATIVE_mpn_rsh1add_nc = null, - .HAVE_NATIVE_mpn_rsh1sub_nc = null, - .HAVE_NATIVE_mpn_sbpi1_bdiv_r = null, - .HAVE_NATIVE_mpn_sqr_basecase = null, - .HAVE_NATIVE_mpn_sqr_diagonal = null, - .HAVE_NATIVE_mpn_sublsh_n = null, - .HAVE_NATIVE_mpn_sublsh1_nc = null, - .HAVE_NATIVE_mpn_sublsh2_nc = null, - 
.HAVE_NATIVE_mpn_sublsh_nc = null, - .HAVE_NATIVE_mpn_sublsh1_n_ip1 = null, - .HAVE_NATIVE_mpn_sublsh2_n_ip1 = null, - .HAVE_NATIVE_mpn_sublsh_n_ip1 = null, - .HAVE_NATIVE_mpn_sublsh1_nc_ip1 = null, - .HAVE_NATIVE_mpn_sublsh2_nc_ip1 = null, - .HAVE_NATIVE_mpn_sublsh_nc_ip1 = null, - .HAVE_NATIVE_mpn_submul_1c = null, - .HAVE_NATIVE_mpn_tabselect = null, - .HAVE_NATIVE_mpn_udiv_qrnnd = null, - .HAVE_NATIVE_mpn_udiv_qrnnd_r = null, - .HAVE_NATIVE_mpn_umul_ppmm = null, - .HAVE_NATIVE_mpn_umul_ppmm_r = null, - .HAVE_OBSTACK_VPRINTF = null, - .HAVE_PSP_ITICKSPERCLKTICK = null, - .HAVE_PSTAT_GETPROCESSOR = null, - .HAVE_READ_REAL_TIME = null, - .HAVE_SIGSTACK = null, - .HAVE_SPEED_CYCLECOUNTER = null, - .HAVE_SSTREAM = null, - .HAVE_STD__LOCALE = null, - .HAVE_SYSSGI = null, - .HAVE_SYS_ATTRIBUTES_H = null, - .HAVE_SYS_IOGRAPH_H = null, - .HAVE_SYS_PROCESSOR_H = null, - .HAVE_SYS_PSTAT_H = null, - .HAVE_SYS_SYSINFO_H = null, - .HAVE_SYS_SYSSGI_H = null, - .HAVE_SYS_SYSTEMCFG_H = null, - .HOST_DOS64 = null, - .NO_ASM = null, - .SSCANF_WRITABLE_INPUT = null, - .WANT_ASSERT = null, - .WANT_FAKE_CPUID = null, - .WANT_FAT_BINARY = null, - .WANT_OLD_FFT_FULL = null, - .WANT_PROFILING_GPROF = null, - .WANT_PROFILING_INSTRUMENT = null, - .WANT_PROFILING_PROF = null, - .WANT_TMP_REENTRANT = null, - .WANT_TMP_NOTREENTRANT = null, - .WANT_TMP_DEBUG = null, - .WORDS_BIGENDIAN = null, - .X86_ASM_MULX = null, - .@"inline" = null, - .@"volatile" = null, - }); - // TODO: Finish this const gmp_h = b.addConfigHeader(.{ .style = .{ @@ -357,7 +35,6 @@ pub fn build(b: *std.Build) void { .CFLAGS = "-O2 -pedantic -march=armv8-a", }); - lib.addConfigHeader(config_h); lib.addConfigHeader(gmp_h); // Static headers @@ -685,6 +362,7 @@ const x86_64_linux_asm_sources = [_][]const u8{ "gen/x86_64-linux/mpn/mul_2.s", "gen/x86_64-linux/mpn/mul_basecase.s", "gen/x86_64-linux/mpn/mullo_basecase.s", + "gen/x86_64-linux/mpn/mulmid_basecase.s", "gen/x86_64-linux/mpn/nand_n.s", 
"gen/x86_64-linux/mpn/nior_n.s", "gen/x86_64-linux/mpn/popcount.s", @@ -703,7 +381,6 @@ const x86_64_linux_asm_sources = [_][]const u8{ "gen/x86_64-linux/mpn/sub_err3_n.s", "gen/x86_64-linux/mpn/sub_n.s", "gen/x86_64-linux/mpn/sublsh1_n.s", - "gen/x86_64-linux/mpn/sublsh2_n.s", "gen/x86_64-linux/mpn/submul_1.s", "gen/x86_64-linux/mpn/xnor_n.s", "gen/x86_64-linux/mpn/xor_n.s", @@ -886,6 +563,7 @@ const generic_c_sources = [_][]const u8{ "mpn/generic/mulmid_basecase.c", "mpn/generic/mulmid_n.c", "mpn/generic/mulmod_bnm1.c", + "mpn/generic/mulmod_bknp1.c", "mpn/generic/neg.c", "mpn/generic/nussbaumer_mul.c", "mpn/generic/perfpow.c", diff --git a/ext/gmp/build.zig.zon b/ext/gmp/build.zig.zon index 3349cef8a6..47538723d8 100644 --- a/ext/gmp/build.zig.zon +++ b/ext/gmp/build.zig.zon @@ -3,8 +3,8 @@ .version = "0.0.1", .dependencies = .{ .gmp = .{ - .url = "https://github.com/alisw/GMP/archive/refs/tags/v6.2.1.tar.gz", - .hash = "12209dd340fd48ad775604d2d4e95155dcf106b8f6c63dd054641d606e2007d806f4", + .url = "https://ftp.gnu.org/gnu/gmp/gmp-6.3.0.tar.gz", + .hash = "1220d46202c17aa35ab5848a7f7a812b797c9f07698f263c8a02b4ad9640a1bbe0e3", }, }, .paths = .{ diff --git a/ext/gmp/gen/README.md b/ext/gmp/gen/README.md index 0a9dab4266..52bdd96af7 100644 --- a/ext/gmp/gen/README.md +++ b/ext/gmp/gen/README.md @@ -1,8 +1,23 @@ # Generated architecture specific `.c`, `.s`, and `.h` files -To generate these, first run the `./configure` script under the unpacked GMP -dependency directory. 
Afterwards, navigate under `mpn/` and run the the -following to generate the assembly files: +To generate these, first run the `./configure` script under the unpacked GMP dependency directory with the following options: + +macOS: +```terminal +./configure --with-pic --disable-shared +``` + +linux-x86_64: +```terminal +./configure --with-pic --disable-shared --host=x86_64-linux-musl +``` + +linux-aarch64: +```terminal +./configure --with-pic --disable-shared --host=aarch64-linux-musl +``` + +Next, navigate under `mpn/` and run the following to generate the assembly files: ```bash for file in $(find . -maxdepth 1 -print | grep "\.asm$"); do @@ -22,6 +37,7 @@ Now, under the GMP root dir run `make` and copy these files as well: - `fac_table.h` - `fib_table.h` - `trialdivtab.h` +- `sieve_table.h` - `mpn/fib_table.c` - `mpn/jacobitab.h` - `mpn/mp_bases.c` diff --git a/ext/gmp/gen/aarch64-linux/config.h b/ext/gmp/gen/aarch64-linux/config.h new file mode 100644 index 0000000000..ad9547c325 --- /dev/null +++ b/ext/gmp/gen/aarch64-linux/config.h @@ -0,0 +1,672 @@ +/* config.h. Generated from config.in by configure. */ +/* config.in. Generated from configure.ac by autoheader. */ + +/* + +Copyright 1996-2022 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. +*/ + +/* Define if building universal (internal helper macro) */ +/* #undef AC_APPLE_UNIVERSAL_BUILD */ + +/* The gmp-mparam.h file (a string) the tune program should suggest updating. + */ +#define GMP_MPARAM_H_SUGGEST "./mpn/arm64/gmp-mparam.h" + +/* Define to 1 if you have the `alarm' function. */ +#define HAVE_ALARM 1 + +/* Define to 1 if alloca() works (via gmp-impl.h). */ +#define HAVE_ALLOCA 1 + +/* Define to 1 if you have and it should be used (not on Ultrix). + */ +#define HAVE_ALLOCA_H 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((const)) */ +#define HAVE_ATTRIBUTE_CONST 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((malloc)) */ +#define HAVE_ATTRIBUTE_MALLOC 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((mode (XX))) + */ +#define HAVE_ATTRIBUTE_MODE 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((noreturn)) */ +#define HAVE_ATTRIBUTE_NORETURN 1 + +/* Define to 1 if you have the `attr_get' function. */ +/* #undef HAVE_ATTR_GET */ + +/* Define to 1 if tests/libtests has calling conventions checking for the CPU + */ +/* #undef HAVE_CALLING_CONVENTIONS */ + +/* Define to 1 if you have the `clock' function. */ +#define HAVE_CLOCK 1 + +/* Define to 1 if you have the `clock_gettime' function */ +#define HAVE_CLOCK_GETTIME 1 + +/* Define to 1 if you have the `cputime' function. */ +/* #undef HAVE_CPUTIME */ + +/* Define to 1 if you have the declaration of `fgetc', and to 0 if you don't. + */ +#define HAVE_DECL_FGETC 1 + +/* Define to 1 if you have the declaration of `fscanf', and to 0 if you don't. + */ +#define HAVE_DECL_FSCANF 1 + +/* Define to 1 if you have the declaration of `optarg', and to 0 if you don't. 
+ */ +#define HAVE_DECL_OPTARG 1 + +/* Define to 1 if you have the declaration of `sys_errlist', and to 0 if you + don't. */ +#define HAVE_DECL_SYS_ERRLIST 0 + +/* Define to 1 if you have the declaration of `sys_nerr', and to 0 if you + don't. */ +#define HAVE_DECL_SYS_NERR 0 + +/* Define to 1 if you have the declaration of `ungetc', and to 0 if you don't. + */ +#define HAVE_DECL_UNGETC 1 + +/* Define to 1 if you have the declaration of `vfprintf', and to 0 if you + don't. */ +#define HAVE_DECL_VFPRINTF 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define one of the following to 1 for the format of a `double'. + If your format is not among these choices, or you don't know what it is, + then leave all undefined. + IEEE_LITTLE_SWAPPED means little endian, but with the two 4-byte halves + swapped, as used by ARM CPUs in little endian mode. */ +/* #undef HAVE_DOUBLE_IEEE_BIG_ENDIAN */ +#define HAVE_DOUBLE_IEEE_LITTLE_ENDIAN 1 +/* #undef HAVE_DOUBLE_IEEE_LITTLE_SWAPPED */ +/* #undef HAVE_DOUBLE_VAX_D */ +/* #undef HAVE_DOUBLE_VAX_G */ +/* #undef HAVE_DOUBLE_CRAY_CFP */ + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_FLOAT_H 1 + +/* Define to 1 if you have the `getpagesize' function. */ +#define HAVE_GETPAGESIZE 1 + +/* Define to 1 if you have the `getrusage' function. */ +#define HAVE_GETRUSAGE 1 + +/* Define to 1 if you have the `getsysinfo' function. */ +/* #undef HAVE_GETSYSINFO */ + +/* Define to 1 if you have the `gettimeofday' function. */ +#define HAVE_GETTIMEOFDAY 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((visibility)) + and __attribute__ ((alias)) */ +#define HAVE_HIDDEN_ALIAS 1 + +/* Define one of these to 1 for the host CPU family. + If your CPU is not in any of these families, leave all undefined. + For an AMD64 chip, define "x86" in ABI=32, but not in ABI=64. 
*/ +/* #undef HAVE_HOST_CPU_FAMILY_alpha */ +/* #undef HAVE_HOST_CPU_FAMILY_m68k */ +/* #undef HAVE_HOST_CPU_FAMILY_power */ +/* #undef HAVE_HOST_CPU_FAMILY_powerpc */ +/* #undef HAVE_HOST_CPU_FAMILY_x86 */ +/* #undef HAVE_HOST_CPU_FAMILY_x86_64 */ + +/* Define one of the following to 1 for the host CPU, as per the output of + ./config.guess. If your CPU is not listed here, leave all undefined. */ +/* #undef HAVE_HOST_CPU_alphaev67 */ +/* #undef HAVE_HOST_CPU_alphaev68 */ +/* #undef HAVE_HOST_CPU_alphaev7 */ +/* #undef HAVE_HOST_CPU_m68020 */ +/* #undef HAVE_HOST_CPU_m68030 */ +/* #undef HAVE_HOST_CPU_m68040 */ +/* #undef HAVE_HOST_CPU_m68060 */ +/* #undef HAVE_HOST_CPU_m68360 */ +/* #undef HAVE_HOST_CPU_powerpc604 */ +/* #undef HAVE_HOST_CPU_powerpc604e */ +/* #undef HAVE_HOST_CPU_powerpc750 */ +/* #undef HAVE_HOST_CPU_powerpc7400 */ +/* #undef HAVE_HOST_CPU_supersparc */ +/* #undef HAVE_HOST_CPU_i386 */ +/* #undef HAVE_HOST_CPU_i586 */ +/* #undef HAVE_HOST_CPU_i686 */ +/* #undef HAVE_HOST_CPU_pentium */ +/* #undef HAVE_HOST_CPU_pentiummmx */ +/* #undef HAVE_HOST_CPU_pentiumpro */ +/* #undef HAVE_HOST_CPU_pentium2 */ +/* #undef HAVE_HOST_CPU_pentium3 */ +/* #undef HAVE_HOST_CPU_pentium4 */ +/* #undef HAVE_HOST_CPU_core2 */ +/* #undef HAVE_HOST_CPU_nehalem */ +/* #undef HAVE_HOST_CPU_westmere */ +/* #undef HAVE_HOST_CPU_sandybridge */ +/* #undef HAVE_HOST_CPU_ivybridge */ +/* #undef HAVE_HOST_CPU_haswell */ +/* #undef HAVE_HOST_CPU_broadwell */ +/* #undef HAVE_HOST_CPU_skylake */ +/* #undef HAVE_HOST_CPU_silvermont */ +/* #undef HAVE_HOST_CPU_goldmont */ +/* #undef HAVE_HOST_CPU_tremont */ +/* #undef HAVE_HOST_CPU_k8 */ +/* #undef HAVE_HOST_CPU_k10 */ +/* #undef HAVE_HOST_CPU_bulldozer */ +/* #undef HAVE_HOST_CPU_piledriver */ +/* #undef HAVE_HOST_CPU_steamroller */ +/* #undef HAVE_HOST_CPU_excavator */ +/* #undef HAVE_HOST_CPU_zen */ +/* #undef HAVE_HOST_CPU_bobcat */ +/* #undef HAVE_HOST_CPU_jaguar */ +/* #undef HAVE_HOST_CPU_s390_z900 */ +/* #undef 
HAVE_HOST_CPU_s390_z990 */ +/* #undef HAVE_HOST_CPU_s390_z9 */ +/* #undef HAVE_HOST_CPU_s390_z10 */ +/* #undef HAVE_HOST_CPU_s390_z196 */ +/* #undef HAVE_HOST_CPU_s390_z13 */ +/* #undef HAVE_HOST_CPU_s390_z14 */ +/* #undef HAVE_HOST_CPU_s390_z15 */ + +/* Define to 1 iff we have a s390 with 64-bit registers. */ +/* #undef HAVE_HOST_CPU_s390_zarch */ + +/* Define to 1 if the system has the type `intmax_t'. */ +#define HAVE_INTMAX_T 1 + +/* Define to 1 if the system has the type `intptr_t'. */ +#define HAVE_INTPTR_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_INVENT_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LANGINFO_H 1 + +/* Define one of these to 1 for the endianness of `mp_limb_t'. + If the endianness is not a simple big or little, or you don't know what + it is, then leave both undefined. */ +/* #undef HAVE_LIMB_BIG_ENDIAN */ +#define HAVE_LIMB_LITTLE_ENDIAN 1 + +/* Define to 1 if you have the `localeconv' function. */ +#define HAVE_LOCALECONV 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LOCALE_H 1 + +/* Define to 1 if the system has the type `long double'. */ +#define HAVE_LONG_DOUBLE 1 + +/* Define to 1 if the system has the type `long long'. */ +#define HAVE_LONG_LONG 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MACHINE_HAL_SYSINFO_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `memset' function. */ +#define HAVE_MEMSET 1 + +/* Define to 1 if you have the `mmap' function. */ +#define HAVE_MMAP 1 + +/* Define to 1 if you have the `mprotect' function. */ +#define HAVE_MPROTECT 1 + +/* Define to 1 each of the following for which a native (ie. CPU specific) + implementation of the corresponding routine exists. 
*/ +#define HAVE_NATIVE_mpn_add_n 1 +/* #undef HAVE_NATIVE_mpn_add_n_sub_n */ +#define HAVE_NATIVE_mpn_add_nc 1 +/* #undef HAVE_NATIVE_mpn_addaddmul_1msb0 */ +#define HAVE_NATIVE_mpn_addlsh1_n 1 +#define HAVE_NATIVE_mpn_addlsh2_n 1 +/* #undef HAVE_NATIVE_mpn_addlsh_n */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addmul_1c */ +/* #undef HAVE_NATIVE_mpn_addmul_2 */ +/* #undef HAVE_NATIVE_mpn_addmul_3 */ +/* #undef HAVE_NATIVE_mpn_addmul_4 */ +/* #undef HAVE_NATIVE_mpn_addmul_5 */ +/* #undef HAVE_NATIVE_mpn_addmul_6 */ +/* #undef HAVE_NATIVE_mpn_addmul_7 */ +/* #undef HAVE_NATIVE_mpn_addmul_8 */ +/* #undef HAVE_NATIVE_mpn_addmul_2s */ +#define HAVE_NATIVE_mpn_and_n 1 +#define HAVE_NATIVE_mpn_andn_n 1 +#define HAVE_NATIVE_mpn_bdiv_dbm1c 1 +#define HAVE_NATIVE_mpn_bdiv_q_1 1 +#define HAVE_NATIVE_mpn_pi1_bdiv_q_1 1 +#define HAVE_NATIVE_mpn_cnd_add_n 1 +#define HAVE_NATIVE_mpn_cnd_sub_n 1 +#define HAVE_NATIVE_mpn_com 1 +#define HAVE_NATIVE_mpn_copyd 1 +#define HAVE_NATIVE_mpn_copyi 1 +/* #undef HAVE_NATIVE_mpn_div_qr_1n_pi1 */ +/* #undef HAVE_NATIVE_mpn_div_qr_2 */ +/* #undef HAVE_NATIVE_mpn_divexact_1 */ +/* #undef HAVE_NATIVE_mpn_divexact_by3c */ +#define HAVE_NATIVE_mpn_divrem_1 1 +/* #undef HAVE_NATIVE_mpn_divrem_1c */ +/* #undef HAVE_NATIVE_mpn_divrem_2 */ +/* #undef HAVE_NATIVE_mpn_gcd_1 */ +#define HAVE_NATIVE_mpn_gcd_11 1 +#define 
HAVE_NATIVE_mpn_gcd_22 1 +#define HAVE_NATIVE_mpn_hamdist 1 +#define HAVE_NATIVE_mpn_invert_limb 1 +#define HAVE_NATIVE_mpn_ior_n 1 +#define HAVE_NATIVE_mpn_iorn_n 1 +#define HAVE_NATIVE_mpn_lshift 1 +#define HAVE_NATIVE_mpn_lshiftc 1 +/* #undef HAVE_NATIVE_mpn_lshsub_n */ +/* #undef HAVE_NATIVE_mpn_mod_1 */ +/* #undef HAVE_NATIVE_mpn_mod_1_1p */ +/* #undef HAVE_NATIVE_mpn_mod_1c */ +/* #undef HAVE_NATIVE_mpn_mod_1s_2p */ +/* #undef HAVE_NATIVE_mpn_mod_1s_4p */ +#define HAVE_NATIVE_mpn_mod_34lsub1 1 +/* #undef HAVE_NATIVE_mpn_modexact_1_odd */ +/* #undef HAVE_NATIVE_mpn_modexact_1c_odd */ +#define HAVE_NATIVE_mpn_mul_1 1 +#define HAVE_NATIVE_mpn_mul_1c 1 +/* #undef HAVE_NATIVE_mpn_mul_2 */ +/* #undef HAVE_NATIVE_mpn_mul_3 */ +/* #undef HAVE_NATIVE_mpn_mul_4 */ +/* #undef HAVE_NATIVE_mpn_mul_5 */ +/* #undef HAVE_NATIVE_mpn_mul_6 */ +/* #undef HAVE_NATIVE_mpn_mul_basecase */ +/* #undef HAVE_NATIVE_mpn_mullo_basecase */ +#define HAVE_NATIVE_mpn_nand_n 1 +#define HAVE_NATIVE_mpn_nior_n 1 +#define HAVE_NATIVE_mpn_popcount 1 +#define HAVE_NATIVE_mpn_preinv_divrem_1 1 +/* #undef HAVE_NATIVE_mpn_preinv_mod_1 */ +/* #undef HAVE_NATIVE_mpn_redc_1 */ +/* #undef HAVE_NATIVE_mpn_redc_2 */ +#define HAVE_NATIVE_mpn_rsblsh1_n 1 +#define HAVE_NATIVE_mpn_rsblsh2_n 1 +/* #undef HAVE_NATIVE_mpn_rsblsh_n */ +/* #undef HAVE_NATIVE_mpn_rsblsh1_nc */ +/* #undef HAVE_NATIVE_mpn_rsblsh2_nc */ +/* #undef HAVE_NATIVE_mpn_rsblsh_nc */ +#define HAVE_NATIVE_mpn_rsh1add_n 1 +/* #undef HAVE_NATIVE_mpn_rsh1add_nc */ +#define HAVE_NATIVE_mpn_rsh1sub_n 1 +/* #undef HAVE_NATIVE_mpn_rsh1sub_nc */ +#define HAVE_NATIVE_mpn_rshift 1 +/* #undef HAVE_NATIVE_mpn_sbpi1_bdiv_r */ +/* #undef HAVE_NATIVE_mpn_sqr_basecase */ +/* #undef HAVE_NATIVE_mpn_sqr_diagonal */ +#define HAVE_NATIVE_mpn_sqr_diag_addlsh1 1 +#define HAVE_NATIVE_mpn_sub_n 1 +#define HAVE_NATIVE_mpn_sub_nc 1 +#define HAVE_NATIVE_mpn_sublsh1_n 1 +#define HAVE_NATIVE_mpn_sublsh2_n 1 +/* #undef HAVE_NATIVE_mpn_sublsh_n */ +/* #undef 
HAVE_NATIVE_mpn_sublsh1_nc */ +/* #undef HAVE_NATIVE_mpn_sublsh2_nc */ +/* #undef HAVE_NATIVE_mpn_sublsh_nc */ +/* #undef HAVE_NATIVE_mpn_sublsh1_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh2_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh1_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh2_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_submul_1c */ +/* #undef HAVE_NATIVE_mpn_tabselect */ +/* #undef HAVE_NATIVE_mpn_udiv_qrnnd */ +/* #undef HAVE_NATIVE_mpn_udiv_qrnnd_r */ +/* #undef HAVE_NATIVE_mpn_umul_ppmm */ +/* #undef HAVE_NATIVE_mpn_umul_ppmm_r */ +#define HAVE_NATIVE_mpn_xor_n 1 +#define HAVE_NATIVE_mpn_xnor_n 1 + +/* Define to 1 if you have the `nl_langinfo' function. */ +#define HAVE_NL_LANGINFO 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_NL_TYPES_H 1 + +/* Define to 1 if you have the `obstack_vprintf' function. */ +/* #undef HAVE_OBSTACK_VPRINTF 1 */ + +/* Define to 1 if you have the `popen' function. */ +#define HAVE_POPEN 1 + +/* Define to 1 if you have the `processor_info' function. */ +/* #undef HAVE_PROCESSOR_INFO */ + +/* Define to 1 if `struct pst_processor' exists and contains + `psp_iticksperclktick'. */ +/* #undef HAVE_PSP_ITICKSPERCLKTICK */ + +/* Define to 1 if you have the `pstat_getprocessor' function. */ +/* #undef HAVE_PSTAT_GETPROCESSOR */ + +/* Define to 1 if the system has the type `ptrdiff_t'. */ +#define HAVE_PTRDIFF_T 1 + +/* Define to 1 if the system has the type `quad_t'. */ +#define HAVE_QUAD_T 1 + +/* Define to 1 if you have the `raise' function. */ +#define HAVE_RAISE 1 + +/* Define to 1 if you have the `read_real_time' function. */ +/* #undef HAVE_READ_REAL_TIME */ + +/* Define to 1 if you have the `sigaction' function. */ +#define HAVE_SIGACTION 1 + +/* Define to 1 if you have the `sigaltstack' function. */ +#define HAVE_SIGALTSTACK 1 + +/* Define to 1 if you have the `sigstack' function. 
*/ +#define HAVE_SIGSTACK 1 + +/* Tune directory speed_cyclecounter, undef=none, 1=32bits, 2=64bits) */ +/* #undef HAVE_SPEED_CYCLECOUNTER */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SSTREAM */ + +/* Define to 1 if the system has the type `stack_t'. */ +#define HAVE_STACK_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if the system has the type `std::locale'. */ +/* #undef HAVE_STD__LOCALE */ + +/* Define to 1 if you have the `strchr' function. */ +#define HAVE_STRCHR 1 + +/* Define to 1 if you have the `strerror' function. */ +#define HAVE_STRERROR 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strnlen' function. */ +#define HAVE_STRNLEN 1 + +/* Define to 1 if you have the `strtol' function. */ +#define HAVE_STRTOL 1 + +/* Define to 1 if you have the `strtoul' function. */ +#define HAVE_STRTOUL 1 + +/* Define to 1 if you have the `sysconf' function. */ +#define HAVE_SYSCONF 1 + +/* Define to 1 if you have the `sysctl' function. */ +/* #undef HAVE_SYSCTL */ + +/* Define to 1 if you have the `sysctlbyname' function. */ +/* #undef HAVE_SYSCTLBYNAME */ + +/* Define to 1 if you have the `syssgi' function. */ +/* #undef HAVE_SYSSGI */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_ATTRIBUTES_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_IOGRAPH_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_MMAN_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PROCESSOR_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PSTAT_H */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SYS_RESOURCE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSCTL_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSINFO_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSSGI_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSTEMCFG_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIMES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the `times' function. */ +#define HAVE_TIMES 1 + +/* Define to 1 if the system has the type `uint_least32_t'. */ +#define HAVE_UINT_LEAST32_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to 1 if you have the `vsnprintf' function and it works properly. */ +#define HAVE_VSNPRINTF 1 + +/* Define to 1 for Windos/64 */ +/* #undef HOST_DOS64 */ + +/* Assembler local label prefix */ +#define LSYM_PREFIX ".L" + +/* Define to the sub-directory where libtool stores uninstalled libraries. */ +#define LT_OBJDIR ".libs/" + +/* Define to 1 to disable the use of inline assembly */ +/* #undef NO_ASM */ + +/* Name of package */ +#define PACKAGE "gmp" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "gmp-bugs@gmplib.org (see https://gmplib.org/manual/Reporting-Bugs.html)" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "GNU MP" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "GNU MP 6.3.0" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "gmp" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "http://www.gnu.org/software/gmp/" + +/* Define to the version of this package. 
*/ +#define PACKAGE_VERSION "6.3.0" + +/* Define as the return type of signal handlers (`int' or `void'). */ +#define RETSIGTYPE void + +/* The size of `mp_limb_t', as computed by sizeof. */ +#define SIZEOF_MP_LIMB_T 8 + +/* The size of `unsigned', as computed by sizeof. */ +#define SIZEOF_UNSIGNED 4 + +/* The size of `unsigned long', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_LONG 8 + +/* The size of `unsigned short', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_SHORT 2 + +/* The size of `void *', as computed by sizeof. */ +#define SIZEOF_VOID_P 8 + +/* Define to 1 if sscanf requires writable inputs */ +/* #undef SSCANF_WRITABLE_INPUT */ + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define to 1 if you can safely include both and . */ +#define TIME_WITH_SYS_TIME 1 + +/* Maximum size the tune program can test for SQR_TOOM2_THRESHOLD */ +#define TUNE_SQR_TOOM2_MAX SQR_TOOM2_MAX_GENERIC + +/* Version number of package */ +#define VERSION "6.3.0" + +/* Define to 1 to enable ASSERT checking, per --enable-assert */ +/* #undef WANT_ASSERT */ + +/* Define to 1 to enable GMP_CPU_TYPE faking cpuid, per --enable-fake-cpuid */ +/* #undef WANT_FAKE_CPUID */ + +/* Define to 1 when building a fat binary. */ +/* #undef WANT_FAT_BINARY */ + +/* Define to 1 to enable FFTs for multiplication, per --enable-fft */ +#define WANT_FFT 1 + +/* Define to 1 to enable old mpn_mul_fft_full for multiplication, per + --enable-old-fft-full */ +/* #undef WANT_OLD_FFT_FULL */ + +/* Define to 1 if --enable-profiling=gprof */ +/* #undef WANT_PROFILING_GPROF */ + +/* Define to 1 if --enable-profiling=instrument */ +/* #undef WANT_PROFILING_INSTRUMENT */ + +/* Define to 1 if --enable-profiling=prof */ +/* #undef WANT_PROFILING_PROF */ + +/* Define one of these to 1 for the desired temporary memory allocation + method, per --enable-alloca. 
*/ +#define WANT_TMP_ALLOCA 1 +/* #undef WANT_TMP_REENTRANT */ +/* #undef WANT_TMP_NOTREENTRANT */ +/* #undef WANT_TMP_DEBUG */ + +/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most + significant byte first (like Motorola and SPARC, unlike Intel). */ +#if defined AC_APPLE_UNIVERSAL_BUILD +# if defined __BIG_ENDIAN__ +# define WORDS_BIGENDIAN 1 +# endif +#else +# ifndef WORDS_BIGENDIAN +/* # undef WORDS_BIGENDIAN */ +# endif +#endif + +/* Define to 1 if the assembler understands the mulx instruction */ +/* #undef X86_ASM_MULX */ + +/* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a + `char[]'. */ +/* #undef YYTEXT_POINTER */ + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +/* #undef inline */ +#endif + +/* Define to the equivalent of the C99 'restrict' keyword, or to + nothing if this is not supported. Do not define if restrict is + supported directly. */ +#define restrict __restrict +/* Work around a bug in Sun C++: it does not support _Restrict or + __restrict__, even though the corresponding Sun C compiler ends up with + "#define restrict _Restrict" or "#define restrict __restrict__" in the + previous line. Perhaps some future version of Sun C++ will work with + restrict; if so, hopefully it defines __RESTRICT like Sun C does. */ +#if defined __SUNPRO_CC && !defined __RESTRICT +# define _Restrict +# define __restrict__ +#endif + +/* Define to empty if the keyword `volatile' does not work. Warning: valid + code using `volatile' can become incorrect without. Disable with care. */ +/* #undef volatile */ diff --git a/ext/gmp/gen/aarch64-linux/sieve_table.h b/ext/gmp/gen/aarch64-linux/sieve_table.h new file mode 100644 index 0000000000..ee9ac14360 --- /dev/null +++ b/ext/gmp/gen/aarch64-linux/sieve_table.h @@ -0,0 +1,46 @@ +/* This file generated by gen-sieve.c - DO NOT EDIT. 
*/ + +#if GMP_LIMB_BITS != 64 +Error, error, this data is for 64 bits +#endif + +#define PRIMESIEVE_INIT_TABLE \ + CNST_LIMB (0x3294C9E069128480), /* 5 - 196 (42 primes) */ \ + CNST_LIMB (0x95A35E1EC4AB21DC), /* 197 - 388 (32 primes) */ \ + CNST_LIMB (0x4AD7CE99B8693366), /* 389 - 580 (30 primes) */ \ + CNST_LIMB (0x6595B6DA728DC52B), /* 581 - 772 (30 primes) */ \ + CNST_LIMB (0xEA6D9F8787B0CEDE), /* 773 - 964 (26 primes) */ \ + CNST_LIMB (0x3F56A1F4CD3275A9), /* 965 - 1156 (29 primes) */ \ + CNST_LIMB (0xFD3848FB74A76ADB), /* 1157 - 1348 (26 primes) */ \ + CNST_LIMB (0xDBBA0DD1A1EDF6AF), /* 1349 - 1540 (25 primes) */ \ + CNST_LIMB (0xCEC7F17ED22799A5), /* 1541 - 1732 (27 primes) */ \ + CNST_LIMB (0xEAEC17BDBB717D56), /* 1733 - 1924 (24 primes) */ \ + CNST_LIMB (0x3B0EB7B3585AFCF3), /* 1925 - 2116 (26 primes) */ \ + CNST_LIMB (0xE563D8F69FDF6C4F), /* 2117 - 2308 (23 primes) */ \ + CNST_LIMB (0xFE5BA7ABA45E92FC), /* 2309 - 2500 (25 primes) */ \ + CNST_LIMB (0x158DEE6F3BF49B7D), /* 2501 - 2692 (24 primes) */ \ + CNST_LIMB (0xBE5A7BC4EDE6CD1A), /* 2693 - 2884 (26 primes) */ \ + CNST_LIMB (0xD7679B3FCA7BB6AD), /* 2885 - 3076 (22 primes) */ \ + CNST_LIMB (0xC3F66B971FEF37E9), /* 3077 - 3268 (22 primes) */ \ + CNST_LIMB (0x6F7EBCF339C953FD), /* 3269 - 3460 (22 primes) */ \ + CNST_LIMB (0xD5A5ECDCD235DBF0), /* 3461 - 3652 (27 primes) */ \ + CNST_LIMB (0xECFA7B2FD5B65E3B), /* 3653 - 3844 (22 primes) */ \ + CNST_LIMB (0xD28EFDF9C89F67B1), /* 3845 - 4036 (25 primes) */ \ + CNST_LIMB (0xCB7F7C7A3DD3AF4F), /* 4037 - 4228 (21 primes) */ \ + CNST_LIMB (0xEEBED6CDFF6B32CC), /* 4229 - 4420 (22 primes) */ \ + CNST_LIMB (0xD5BD73F85ECFA97C), /* 4421 - 4612 (23 primes) */ \ + CNST_LIMB (0x21FDBE4FBBAD48F7), /* 4613 - 4804 (24 primes) */ \ + CNST_LIMB (0x5E35A3B5EEB7FDE7), /* 4805 - 4996 (21 primes) */ \ + CNST_LIMB (0xD9EBFD53A7DBBCC9), /* 4997 - 5188 (22 primes) */ \ + CNST_LIMB (0xFF9EDEAF2EFE1F76), /* 5189 - 5380 (18 primes) */ +#define PRIMESIEVE_NUMBEROF_TABLE 28 +/* #define 
PRIMESIEVE_PRIMES_IN_TABLE 706 */ +#define PRIMESIEVE_HIGHEST_PRIME 5351 +/* #define PRIMESIEVE_FIRST_UNCHECKED 5381 */ + +#define SIEVE_MASK1 CNST_LIMB(0x3204C1A049120485) +#define SIEVE_MASKT CNST_LIMB(0xA1204892058) +#define SIEVE_2MSK1 CNST_LIMB(0x29048402110840A) +#define SIEVE_2MSK2 CNST_LIMB(0x9402180C40230184) +#define SIEVE_2MSKT CNST_LIMB(0x5021088402120) + diff --git a/ext/gmp/gen/aarch64-macos/config.h b/ext/gmp/gen/aarch64-macos/config.h new file mode 100644 index 0000000000..dd1ca7f842 --- /dev/null +++ b/ext/gmp/gen/aarch64-macos/config.h @@ -0,0 +1,668 @@ +/* config.h. Generated from config.in by configure. */ +/* config.in. Generated from configure.ac by autoheader. */ + +/* + +Copyright 1996-2020 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. +*/ + +/* Define if building universal (internal helper macro) */ +/* #undef AC_APPLE_UNIVERSAL_BUILD */ + +/* The gmp-mparam.h file (a string) the tune program should suggest updating. 
+ */ +#define GMP_MPARAM_H_SUGGEST "./mpn/arm64/gmp-mparam.h" + +/* Define to 1 if you have the `alarm' function. */ +#define HAVE_ALARM 1 + +/* Define to 1 if alloca() works (via gmp-impl.h). */ +#define HAVE_ALLOCA 1 + +/* Define to 1 if you have and it should be used (not on Ultrix). + */ +#define HAVE_ALLOCA_H 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((const)) */ +#define HAVE_ATTRIBUTE_CONST 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((malloc)) */ +#define HAVE_ATTRIBUTE_MALLOC 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((mode (XX))) + */ +#define HAVE_ATTRIBUTE_MODE 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((noreturn)) */ +#define HAVE_ATTRIBUTE_NORETURN 1 + +/* Define to 1 if you have the `attr_get' function. */ +/* #undef HAVE_ATTR_GET */ + +/* Define to 1 if tests/libtests has calling conventions checking for the CPU + */ +/* #undef HAVE_CALLING_CONVENTIONS */ + +/* Define to 1 if you have the `clock' function. */ +#define HAVE_CLOCK 1 + +/* Define to 1 if you have the `clock_gettime' function */ +#define HAVE_CLOCK_GETTIME 1 + +/* Define to 1 if you have the `cputime' function. */ +/* #undef HAVE_CPUTIME */ + +/* Define to 1 if you have the declaration of `fgetc', and to 0 if you don't. + */ +#define HAVE_DECL_FGETC 1 + +/* Define to 1 if you have the declaration of `fscanf', and to 0 if you don't. + */ +#define HAVE_DECL_FSCANF 1 + +/* Define to 1 if you have the declaration of `optarg', and to 0 if you don't. + */ +#define HAVE_DECL_OPTARG 1 + +/* Define to 1 if you have the declaration of `sys_errlist', and to 0 if you + don't. */ +#define HAVE_DECL_SYS_ERRLIST 1 + +/* Define to 1 if you have the declaration of `sys_nerr', and to 0 if you + don't. */ +#define HAVE_DECL_SYS_NERR 1 + +/* Define to 1 if you have the declaration of `ungetc', and to 0 if you don't. 
+ */ +#define HAVE_DECL_UNGETC 1 + +/* Define to 1 if you have the declaration of `vfprintf', and to 0 if you + don't. */ +#define HAVE_DECL_VFPRINTF 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define one of the following to 1 for the format of a `double'. + If your format is not among these choices, or you don't know what it is, + then leave all undefined. + IEEE_LITTLE_SWAPPED means little endian, but with the two 4-byte halves + swapped, as used by ARM CPUs in little endian mode. */ +/* #undef HAVE_DOUBLE_IEEE_BIG_ENDIAN */ +#define HAVE_DOUBLE_IEEE_LITTLE_ENDIAN 1 +/* #undef HAVE_DOUBLE_IEEE_LITTLE_SWAPPED */ +/* #undef HAVE_DOUBLE_VAX_D */ +/* #undef HAVE_DOUBLE_VAX_G */ +/* #undef HAVE_DOUBLE_CRAY_CFP */ + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_FLOAT_H 1 + +/* Define to 1 if you have the `getpagesize' function. */ +#define HAVE_GETPAGESIZE 1 + +/* Define to 1 if you have the `getrusage' function. */ +#define HAVE_GETRUSAGE 1 + +/* Define to 1 if you have the `getsysinfo' function. */ +/* #undef HAVE_GETSYSINFO */ + +/* Define to 1 if you have the `gettimeofday' function. */ +#define HAVE_GETTIMEOFDAY 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((visibility)) + and __attribute__ ((alias)) */ +/* #undef HAVE_HIDDEN_ALIAS */ + +/* Define one of these to 1 for the host CPU family. + If your CPU is not in any of these families, leave all undefined. + For an AMD64 chip, define "x86" in ABI=32, but not in ABI=64. */ +/* #undef HAVE_HOST_CPU_FAMILY_alpha */ +/* #undef HAVE_HOST_CPU_FAMILY_m68k */ +/* #undef HAVE_HOST_CPU_FAMILY_power */ +/* #undef HAVE_HOST_CPU_FAMILY_powerpc */ +/* #undef HAVE_HOST_CPU_FAMILY_x86 */ +/* #undef HAVE_HOST_CPU_FAMILY_x86_64 */ + +/* Define one of the following to 1 for the host CPU, as per the output of + ./config.guess. If your CPU is not listed here, leave all undefined. 
*/ +/* #undef HAVE_HOST_CPU_alphaev67 */ +/* #undef HAVE_HOST_CPU_alphaev68 */ +/* #undef HAVE_HOST_CPU_alphaev7 */ +/* #undef HAVE_HOST_CPU_m68020 */ +/* #undef HAVE_HOST_CPU_m68030 */ +/* #undef HAVE_HOST_CPU_m68040 */ +/* #undef HAVE_HOST_CPU_m68060 */ +/* #undef HAVE_HOST_CPU_m68360 */ +/* #undef HAVE_HOST_CPU_powerpc604 */ +/* #undef HAVE_HOST_CPU_powerpc604e */ +/* #undef HAVE_HOST_CPU_powerpc750 */ +/* #undef HAVE_HOST_CPU_powerpc7400 */ +/* #undef HAVE_HOST_CPU_supersparc */ +/* #undef HAVE_HOST_CPU_i386 */ +/* #undef HAVE_HOST_CPU_i586 */ +/* #undef HAVE_HOST_CPU_i686 */ +/* #undef HAVE_HOST_CPU_pentium */ +/* #undef HAVE_HOST_CPU_pentiummmx */ +/* #undef HAVE_HOST_CPU_pentiumpro */ +/* #undef HAVE_HOST_CPU_pentium2 */ +/* #undef HAVE_HOST_CPU_pentium3 */ +/* #undef HAVE_HOST_CPU_pentium4 */ +/* #undef HAVE_HOST_CPU_core2 */ +/* #undef HAVE_HOST_CPU_nehalem */ +/* #undef HAVE_HOST_CPU_westmere */ +/* #undef HAVE_HOST_CPU_sandybridge */ +/* #undef HAVE_HOST_CPU_ivybridge */ +/* #undef HAVE_HOST_CPU_haswell */ +/* #undef HAVE_HOST_CPU_broadwell */ +/* #undef HAVE_HOST_CPU_skylake */ +/* #undef HAVE_HOST_CPU_silvermont */ +/* #undef HAVE_HOST_CPU_goldmont */ +/* #undef HAVE_HOST_CPU_k8 */ +/* #undef HAVE_HOST_CPU_k10 */ +/* #undef HAVE_HOST_CPU_bulldozer */ +/* #undef HAVE_HOST_CPU_piledriver */ +/* #undef HAVE_HOST_CPU_steamroller */ +/* #undef HAVE_HOST_CPU_excavator */ +/* #undef HAVE_HOST_CPU_zen */ +/* #undef HAVE_HOST_CPU_bobcat */ +/* #undef HAVE_HOST_CPU_jaguar */ +/* #undef HAVE_HOST_CPU_s390_z900 */ +/* #undef HAVE_HOST_CPU_s390_z990 */ +/* #undef HAVE_HOST_CPU_s390_z9 */ +/* #undef HAVE_HOST_CPU_s390_z10 */ +/* #undef HAVE_HOST_CPU_s390_z196 */ + +/* Define to 1 iff we have a s390 with 64-bit registers. */ +/* #undef HAVE_HOST_CPU_s390_zarch */ + +/* Define to 1 if the system has the type `intmax_t'. */ +#define HAVE_INTMAX_T 1 + +/* Define to 1 if the system has the type `intptr_t'. 
*/ +#define HAVE_INTPTR_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_INVENT_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LANGINFO_H 1 + +/* Define one of these to 1 for the endianness of `mp_limb_t'. + If the endianness is not a simple big or little, or you don't know what + it is, then leave both undefined. */ +/* #undef HAVE_LIMB_BIG_ENDIAN */ +#define HAVE_LIMB_LITTLE_ENDIAN 1 + +/* Define to 1 if you have the `localeconv' function. */ +#define HAVE_LOCALECONV 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LOCALE_H 1 + +/* Define to 1 if the system has the type `long double'. */ +#define HAVE_LONG_DOUBLE 1 + +/* Define to 1 if the system has the type `long long'. */ +#define HAVE_LONG_LONG 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MACHINE_HAL_SYSINFO_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `memset' function. */ +#define HAVE_MEMSET 1 + +/* Define to 1 if you have the `mmap' function. */ +#define HAVE_MMAP 1 + +/* Define to 1 if you have the `mprotect' function. */ +#define HAVE_MPROTECT 1 + +/* Define to 1 each of the following for which a native (ie. CPU specific) + implementation of the corresponding routine exists. 
*/ +#define HAVE_NATIVE_mpn_add_n 1 +/* #undef HAVE_NATIVE_mpn_add_n_sub_n */ +#define HAVE_NATIVE_mpn_add_nc 1 +/* #undef HAVE_NATIVE_mpn_addaddmul_1msb0 */ +#define HAVE_NATIVE_mpn_addlsh1_n 1 +#define HAVE_NATIVE_mpn_addlsh2_n 1 +/* #undef HAVE_NATIVE_mpn_addlsh_n */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addmul_1c */ +/* #undef HAVE_NATIVE_mpn_addmul_2 */ +/* #undef HAVE_NATIVE_mpn_addmul_3 */ +/* #undef HAVE_NATIVE_mpn_addmul_4 */ +/* #undef HAVE_NATIVE_mpn_addmul_5 */ +/* #undef HAVE_NATIVE_mpn_addmul_6 */ +/* #undef HAVE_NATIVE_mpn_addmul_7 */ +/* #undef HAVE_NATIVE_mpn_addmul_8 */ +/* #undef HAVE_NATIVE_mpn_addmul_2s */ +#define HAVE_NATIVE_mpn_and_n 1 +#define HAVE_NATIVE_mpn_andn_n 1 +#define HAVE_NATIVE_mpn_bdiv_dbm1c 1 +#define HAVE_NATIVE_mpn_bdiv_q_1 1 +#define HAVE_NATIVE_mpn_pi1_bdiv_q_1 1 +#define HAVE_NATIVE_mpn_cnd_add_n 1 +#define HAVE_NATIVE_mpn_cnd_sub_n 1 +#define HAVE_NATIVE_mpn_com 1 +#define HAVE_NATIVE_mpn_copyd 1 +#define HAVE_NATIVE_mpn_copyi 1 +/* #undef HAVE_NATIVE_mpn_div_qr_1n_pi1 */ +/* #undef HAVE_NATIVE_mpn_div_qr_2 */ +/* #undef HAVE_NATIVE_mpn_divexact_1 */ +/* #undef HAVE_NATIVE_mpn_divexact_by3c */ +/* #undef HAVE_NATIVE_mpn_divrem_1 */ +/* #undef HAVE_NATIVE_mpn_divrem_1c */ +/* #undef HAVE_NATIVE_mpn_divrem_2 */ +/* #undef HAVE_NATIVE_mpn_gcd_1 */ +#define HAVE_NATIVE_mpn_gcd_11 1 +#define 
HAVE_NATIVE_mpn_gcd_22 1 +#define HAVE_NATIVE_mpn_hamdist 1 +#define HAVE_NATIVE_mpn_invert_limb 1 +#define HAVE_NATIVE_mpn_ior_n 1 +#define HAVE_NATIVE_mpn_iorn_n 1 +#define HAVE_NATIVE_mpn_lshift 1 +#define HAVE_NATIVE_mpn_lshiftc 1 +/* #undef HAVE_NATIVE_mpn_lshsub_n */ +/* #undef HAVE_NATIVE_mpn_mod_1 */ +/* #undef HAVE_NATIVE_mpn_mod_1_1p */ +/* #undef HAVE_NATIVE_mpn_mod_1c */ +/* #undef HAVE_NATIVE_mpn_mod_1s_2p */ +/* #undef HAVE_NATIVE_mpn_mod_1s_4p */ +#define HAVE_NATIVE_mpn_mod_34lsub1 1 +/* #undef HAVE_NATIVE_mpn_modexact_1_odd */ +/* #undef HAVE_NATIVE_mpn_modexact_1c_odd */ +#define HAVE_NATIVE_mpn_mul_1 1 +#define HAVE_NATIVE_mpn_mul_1c 1 +/* #undef HAVE_NATIVE_mpn_mul_2 */ +/* #undef HAVE_NATIVE_mpn_mul_3 */ +/* #undef HAVE_NATIVE_mpn_mul_4 */ +/* #undef HAVE_NATIVE_mpn_mul_5 */ +/* #undef HAVE_NATIVE_mpn_mul_6 */ +/* #undef HAVE_NATIVE_mpn_mul_basecase */ +/* #undef HAVE_NATIVE_mpn_mullo_basecase */ +#define HAVE_NATIVE_mpn_nand_n 1 +#define HAVE_NATIVE_mpn_nior_n 1 +#define HAVE_NATIVE_mpn_popcount 1 +/* #undef HAVE_NATIVE_mpn_preinv_divrem_1 */ +/* #undef HAVE_NATIVE_mpn_preinv_mod_1 */ +/* #undef HAVE_NATIVE_mpn_redc_1 */ +/* #undef HAVE_NATIVE_mpn_redc_2 */ +#define HAVE_NATIVE_mpn_rsblsh1_n 1 +#define HAVE_NATIVE_mpn_rsblsh2_n 1 +/* #undef HAVE_NATIVE_mpn_rsblsh_n */ +/* #undef HAVE_NATIVE_mpn_rsblsh1_nc */ +/* #undef HAVE_NATIVE_mpn_rsblsh2_nc */ +/* #undef HAVE_NATIVE_mpn_rsblsh_nc */ +#define HAVE_NATIVE_mpn_rsh1add_n 1 +/* #undef HAVE_NATIVE_mpn_rsh1add_nc */ +#define HAVE_NATIVE_mpn_rsh1sub_n 1 +/* #undef HAVE_NATIVE_mpn_rsh1sub_nc */ +#define HAVE_NATIVE_mpn_rshift 1 +/* #undef HAVE_NATIVE_mpn_sbpi1_bdiv_r */ +/* #undef HAVE_NATIVE_mpn_sqr_basecase */ +/* #undef HAVE_NATIVE_mpn_sqr_diagonal */ +#define HAVE_NATIVE_mpn_sqr_diag_addlsh1 1 +#define HAVE_NATIVE_mpn_sub_n 1 +#define HAVE_NATIVE_mpn_sub_nc 1 +#define HAVE_NATIVE_mpn_sublsh1_n 1 +#define HAVE_NATIVE_mpn_sublsh2_n 1 +/* #undef HAVE_NATIVE_mpn_sublsh_n */ +/* #undef 
HAVE_NATIVE_mpn_sublsh1_nc */ +/* #undef HAVE_NATIVE_mpn_sublsh2_nc */ +/* #undef HAVE_NATIVE_mpn_sublsh_nc */ +/* #undef HAVE_NATIVE_mpn_sublsh1_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh2_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh1_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh2_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_submul_1c */ +/* #undef HAVE_NATIVE_mpn_tabselect */ +/* #undef HAVE_NATIVE_mpn_udiv_qrnnd */ +/* #undef HAVE_NATIVE_mpn_udiv_qrnnd_r */ +/* #undef HAVE_NATIVE_mpn_umul_ppmm */ +/* #undef HAVE_NATIVE_mpn_umul_ppmm_r */ +#define HAVE_NATIVE_mpn_xor_n 1 +#define HAVE_NATIVE_mpn_xnor_n 1 + +/* Define to 1 if you have the `nl_langinfo' function. */ +#define HAVE_NL_LANGINFO 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_NL_TYPES_H 1 + +/* Define to 1 if you have the `obstack_vprintf' function. */ +/* #undef HAVE_OBSTACK_VPRINTF */ + +/* Define to 1 if you have the `popen' function. */ +#define HAVE_POPEN 1 + +/* Define to 1 if you have the `processor_info' function. */ +#define HAVE_PROCESSOR_INFO 1 + +/* Define to 1 if `struct pst_processor' exists and contains + `psp_iticksperclktick'. */ +/* #undef HAVE_PSP_ITICKSPERCLKTICK */ + +/* Define to 1 if you have the `pstat_getprocessor' function. */ +/* #undef HAVE_PSTAT_GETPROCESSOR */ + +/* Define to 1 if the system has the type `ptrdiff_t'. */ +#define HAVE_PTRDIFF_T 1 + +/* Define to 1 if the system has the type `quad_t'. */ +#define HAVE_QUAD_T 1 + +/* Define to 1 if you have the `raise' function. */ +#define HAVE_RAISE 1 + +/* Define to 1 if you have the `read_real_time' function. */ +/* #undef HAVE_READ_REAL_TIME */ + +/* Define to 1 if you have the `sigaction' function. */ +#define HAVE_SIGACTION 1 + +/* Define to 1 if you have the `sigaltstack' function. */ +#define HAVE_SIGALTSTACK 1 + +/* Define to 1 if you have the `sigstack' function. 
*/ +/* #undef HAVE_SIGSTACK */ + +/* Tune directory speed_cyclecounter, undef=none, 1=32bits, 2=64bits) */ +/* #undef HAVE_SPEED_CYCLECOUNTER */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SSTREAM */ + +/* Define to 1 if the system has the type `stack_t'. */ +#define HAVE_STACK_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if the system has the type `std::locale'. */ +/* #undef HAVE_STD__LOCALE */ + +/* Define to 1 if you have the `strchr' function. */ +#define HAVE_STRCHR 1 + +/* Define to 1 if you have the `strerror' function. */ +#define HAVE_STRERROR 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strnlen' function. */ +#define HAVE_STRNLEN 1 + +/* Define to 1 if you have the `strtol' function. */ +#define HAVE_STRTOL 1 + +/* Define to 1 if you have the `strtoul' function. */ +#define HAVE_STRTOUL 1 + +/* Define to 1 if you have the `sysconf' function. */ +#define HAVE_SYSCONF 1 + +/* Define to 1 if you have the `sysctl' function. */ +#define HAVE_SYSCTL 1 + +/* Define to 1 if you have the `sysctlbyname' function. */ +#define HAVE_SYSCTLBYNAME 1 + +/* Define to 1 if you have the `syssgi' function. */ +/* #undef HAVE_SYSSGI */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_ATTRIBUTES_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_IOGRAPH_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_MMAN_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PROCESSOR_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PSTAT_H */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SYS_RESOURCE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSCTL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSINFO_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSSGI_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSTEMCFG_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIMES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the `times' function. */ +#define HAVE_TIMES 1 + +/* Define to 1 if the system has the type `uint_least32_t'. */ +#define HAVE_UINT_LEAST32_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to 1 if you have the `vsnprintf' function and it works properly. */ +#define HAVE_VSNPRINTF 1 + +/* Define to 1 for Windos/64 */ +/* #undef HOST_DOS64 */ + +/* Assembler local label prefix */ +#define LSYM_PREFIX "L" + +/* Define to the sub-directory where libtool stores uninstalled libraries. */ +#define LT_OBJDIR ".libs/" + +/* Define to 1 to disable the use of inline assembly */ +/* #undef NO_ASM */ + +/* Name of package */ +#define PACKAGE "gmp" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "gmp-bugs@gmplib.org, see https://gmplib.org/manual/Reporting-Bugs.html" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "GNU MP" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "GNU MP 6.2.1" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "gmp" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "http://www.gnu.org/software/gmp/" + +/* Define to the version of this package. 
*/ +#define PACKAGE_VERSION "6.2.1" + +/* Define as the return type of signal handlers (`int' or `void'). */ +#define RETSIGTYPE void + +/* The size of `mp_limb_t', as computed by sizeof. */ +#define SIZEOF_MP_LIMB_T 8 + +/* The size of `unsigned', as computed by sizeof. */ +#define SIZEOF_UNSIGNED 4 + +/* The size of `unsigned long', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_LONG 8 + +/* The size of `unsigned short', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_SHORT 2 + +/* The size of `void *', as computed by sizeof. */ +#define SIZEOF_VOID_P 8 + +/* Define to 1 if sscanf requires writable inputs */ +/* #undef SSCANF_WRITABLE_INPUT */ + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define to 1 if you can safely include both and . */ +#define TIME_WITH_SYS_TIME 1 + +/* Maximum size the tune program can test for SQR_TOOM2_THRESHOLD */ +#define TUNE_SQR_TOOM2_MAX SQR_TOOM2_MAX_GENERIC + +/* Version number of package */ +#define VERSION "6.2.1" + +/* Define to 1 to enable ASSERT checking, per --enable-assert */ +/* #undef WANT_ASSERT */ + +/* Define to 1 to enable GMP_CPU_TYPE faking cpuid, per --enable-fake-cpuid */ +/* #undef WANT_FAKE_CPUID */ + +/* Define to 1 when building a fat binary. */ +/* #undef WANT_FAT_BINARY */ + +/* Define to 1 to enable FFTs for multiplication, per --enable-fft */ +#define WANT_FFT 1 + +/* Define to 1 to enable old mpn_mul_fft_full for multiplication, per + --enable-old-fft-full */ +/* #undef WANT_OLD_FFT_FULL */ + +/* Define to 1 if --enable-profiling=gprof */ +/* #undef WANT_PROFILING_GPROF */ + +/* Define to 1 if --enable-profiling=instrument */ +/* #undef WANT_PROFILING_INSTRUMENT */ + +/* Define to 1 if --enable-profiling=prof */ +/* #undef WANT_PROFILING_PROF */ + +/* Define one of these to 1 for the desired temporary memory allocation + method, per --enable-alloca. 
*/ +#define WANT_TMP_ALLOCA 1 +/* #undef WANT_TMP_REENTRANT */ +/* #undef WANT_TMP_NOTREENTRANT */ +/* #undef WANT_TMP_DEBUG */ + +/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most + significant byte first (like Motorola and SPARC, unlike Intel). */ +#if defined AC_APPLE_UNIVERSAL_BUILD +# if defined __BIG_ENDIAN__ +# define WORDS_BIGENDIAN 1 +# endif +#else +# ifndef WORDS_BIGENDIAN +/* # undef WORDS_BIGENDIAN */ +# endif +#endif + +/* Define to 1 if the assembler understands the mulx instruction */ +/* #undef X86_ASM_MULX */ + +/* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a + `char[]'. */ +#define YYTEXT_POINTER 1 + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +/* #undef inline */ +#endif + +/* Define to the equivalent of the C99 'restrict' keyword, or to + nothing if this is not supported. Do not define if restrict is + supported directly. */ +#define restrict __restrict +/* Work around a bug in Sun C++: it does not support _Restrict or + __restrict__, even though the corresponding Sun C compiler ends up with + "#define restrict _Restrict" or "#define restrict __restrict__" in the + previous line. Perhaps some future version of Sun C++ will work with + restrict; if so, hopefully it defines __RESTRICT like Sun C does. */ +#if defined __SUNPRO_CC && !defined __RESTRICT +# define _Restrict +# define __restrict__ +#endif + +/* Define to empty if the keyword `volatile' does not work. Warning: valid + code using `volatile' can become incorrect without. Disable with care. 
*/ +/* #undef volatile */ diff --git a/ext/gmp/gen/aarch64-macos/mpn/add_n.s b/ext/gmp/gen/aarch64-macos/mpn/add_n.s index 136fdacc83..5f2b539e98 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/add_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/add_n.s @@ -77,7 +77,7 @@ ___gmpn_add_nc: ___gmpn_add_n: cmn xzr, xzr -Lent: lsr x18, x3, #2 +Lent: lsr x17, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x7, [x1] @@ -86,7 +86,7 @@ Lbx1: ldr x7, [x1] str x13, [x0],#8 tbnz x3, #1, Lb11 -Lb01: cbz x18, Lret +Lb01: cbz x17, Lret ldp x4, x5, [x1,#8] ldp x8, x9, [x2,#8] sub x1, x1, #8 @@ -97,7 +97,7 @@ Lb11: ldp x6, x7, [x1,#8] ldp x10, x11, [x2,#8] add x1, x1, #8 add x2, x2, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -110,7 +110,7 @@ Lb00: ldp x4, x5, [x1] Lb10: ldp x6, x7, [x1] ldp x10, x11, [x2] - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -123,8 +123,8 @@ Lmid: ldp x6, x7, [x1,#32]! adcs x12, x4, x8 adcs x13, x5, x9 stp x12, x13, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: adcs x12, x6, x10 adcs x13, x7, x11 diff --git a/ext/gmp/gen/aarch64-macos/mpn/addlsh1_n.s b/ext/gmp/gen/aarch64-macos/mpn/addlsh1_n.s index 9cde4af8e2..bf0bdcbeb9 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/addlsh1_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/addlsh1_n.s @@ -86,14 +86,14 @@ .globl ___gmpn_addlsh1_n ___gmpn_addlsh1_n: - lsr x18, x3, #2 + lsr x6, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x5, [x1] tbnz x3, #1, Lb11 Lb01: ldr x11, [x2] - cbz x18, L1 + cbz x6, L1 ldp x8, x9, [x2,#8] lsl x13, x11, #1 adds x15, x13, x5 @@ -115,7 +115,7 @@ Lb11: ldr x9, [x2] adds x17, x13, x5 str x17, [x0],#8 sub x1, x1, #8 - cbz x18, Lend + cbz x6, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -128,7 +128,7 @@ Lb00: adds x11, xzr, xzr Lb10: adds x9, xzr, xzr ldp x10, x11, [x2] sub x1, x1, #16 - cbz x18, Lend + cbz x6, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -145,8 +145,8 @@ Lmid: ldp x4, x5, [x1,#32]! 
adcs x16, x12, x4 adcs x17, x13, x5 stp x16, x17, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x6, x6, #1 + cbnz x6, Ltop Lend: ldp x4, x5, [x1,#16] extr x12, x10, x9, #63 diff --git a/ext/gmp/gen/aarch64-macos/mpn/addlsh2_n.s b/ext/gmp/gen/aarch64-macos/mpn/addlsh2_n.s index f923e69202..e167b893e7 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/addlsh2_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/addlsh2_n.s @@ -86,14 +86,14 @@ .globl ___gmpn_addlsh2_n ___gmpn_addlsh2_n: - lsr x18, x3, #2 + lsr x6, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x5, [x1] tbnz x3, #1, Lb11 Lb01: ldr x11, [x2] - cbz x18, L1 + cbz x6, L1 ldp x8, x9, [x2,#8] lsl x13, x11, #2 adds x15, x13, x5 @@ -115,7 +115,7 @@ Lb11: ldr x9, [x2] adds x17, x13, x5 str x17, [x0],#8 sub x1, x1, #8 - cbz x18, Lend + cbz x6, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -128,7 +128,7 @@ Lb00: adds x11, xzr, xzr Lb10: adds x9, xzr, xzr ldp x10, x11, [x2] sub x1, x1, #16 - cbz x18, Lend + cbz x6, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -145,8 +145,8 @@ Lmid: ldp x4, x5, [x1,#32]! 
adcs x16, x12, x4 adcs x17, x13, x5 stp x16, x17, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x6, x6, #1 + cbnz x6, Ltop Lend: ldp x4, x5, [x1,#16] extr x12, x10, x9, #62 diff --git a/ext/gmp/gen/aarch64-macos/mpn/addmul_1.s b/ext/gmp/gen/aarch64-macos/mpn/addmul_1.s index 6c82fb3de6..09ce6a3921 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/addmul_1.s +++ b/ext/gmp/gen/aarch64-macos/mpn/addmul_1.s @@ -53,6 +53,11 @@ + + + + + diff --git a/ext/gmp/gen/aarch64-macos/mpn/and_n.s b/ext/gmp/gen/aarch64-macos/mpn/and_n.s index c6d0d23cb0..6967e35beb 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/and_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/and_n.s @@ -73,7 +73,7 @@ .globl ___gmpn_and_n ___gmpn_and_n: - lsr x18, x3, #2 + lsr x17, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x7, [x1] @@ -82,7 +82,7 @@ Lbx1: ldr x7, [x1] str x15, [x0],#8 tbnz x3, #1, Lb11 -Lb01: cbz x18, Lret +Lb01: cbz x17, Lret ldp x4, x5, [x1,#8] ldp x8, x9, [x2,#8] sub x1, x1, #8 @@ -93,7 +93,7 @@ Lb11: ldp x6, x7, [x1,#8] ldp x10, x11, [x2,#8] add x1, x1, #8 add x2, x2, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -104,7 +104,7 @@ Lb00: ldp x4, x5, [x1],#-16 Lb10: ldp x6, x7, [x1] ldp x10, x11, [x2] - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -117,8 +117,8 @@ Lmid: ldp x6, x7, [x1,#32]! 
and x12, x4, x8 and x13, x5, x9 stp x12, x13, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: and x12, x6, x10 and x13, x7, x11 diff --git a/ext/gmp/gen/aarch64-macos/mpn/andn_n.s b/ext/gmp/gen/aarch64-macos/mpn/andn_n.s index 9d2318f0a3..6f928f10bc 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/andn_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/andn_n.s @@ -73,7 +73,7 @@ .globl ___gmpn_andn_n ___gmpn_andn_n: - lsr x18, x3, #2 + lsr x17, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x7, [x1] @@ -82,7 +82,7 @@ Lbx1: ldr x7, [x1] str x15, [x0],#8 tbnz x3, #1, Lb11 -Lb01: cbz x18, Lret +Lb01: cbz x17, Lret ldp x4, x5, [x1,#8] ldp x8, x9, [x2,#8] sub x1, x1, #8 @@ -93,7 +93,7 @@ Lb11: ldp x6, x7, [x1,#8] ldp x10, x11, [x2,#8] add x1, x1, #8 add x2, x2, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -104,7 +104,7 @@ Lb00: ldp x4, x5, [x1],#-16 Lb10: ldp x6, x7, [x1] ldp x10, x11, [x2] - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -117,8 +117,8 @@ Lmid: ldp x6, x7, [x1,#32]! bic x12, x4, x8 bic x13, x5, x9 stp x12, x13, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: bic x12, x6, x10 bic x13, x7, x11 diff --git a/ext/gmp/gen/aarch64-macos/mpn/cnd_add_n.s b/ext/gmp/gen/aarch64-macos/mpn/cnd_add_n.s index 1282e0241d..606d6c4948 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/cnd_add_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/cnd_add_n.s @@ -72,7 +72,7 @@ ___gmpn_cnd_add_n: cmn xzr, xzr - lsr x18, x4, #2 + lsr x17, x4, #2 tbz x4, #0, Lbx0 Lbx1: ldr x13, [x3] @@ -82,7 +82,7 @@ Lbx1: ldr x13, [x3] str x9, [x1] tbnz x4, #1, Lb11 -Lb01: cbz x18, Lrt +Lb01: cbz x17, Lrt ldp x12, x13, [x3,#8] ldp x10, x11, [x2,#8] sub x2, x2, #8 @@ -93,7 +93,7 @@ Lb01: cbz x18, Lrt Lb11: ldp x12, x13, [x3,#8]! ldp x10, x11, [x2,#8]! 
sub x1, x1, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: ldp x12, x13, [x3] @@ -106,7 +106,7 @@ Lb00: sub x2, x2, #16 b Lmid Lb10: sub x1, x1, #16 - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: bic x6, x12, x0 @@ -123,8 +123,8 @@ Lmid: bic x6, x12, x0 adcs x9, x11, x7 ldp x10, x11, [x2,#32]! stp x8, x9, [x1,#32]! - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: bic x6, x12, x0 bic x7, x13, x0 diff --git a/ext/gmp/gen/aarch64-macos/mpn/cnd_sub_n.s b/ext/gmp/gen/aarch64-macos/mpn/cnd_sub_n.s index 5663667b12..be253fe14d 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/cnd_sub_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/cnd_sub_n.s @@ -72,7 +72,7 @@ ___gmpn_cnd_sub_n: cmp xzr, xzr - lsr x18, x4, #2 + lsr x17, x4, #2 tbz x4, #0, Lbx0 Lbx1: ldr x13, [x3] @@ -82,7 +82,7 @@ Lbx1: ldr x13, [x3] str x9, [x1] tbnz x4, #1, Lb11 -Lb01: cbz x18, Lrt +Lb01: cbz x17, Lrt ldp x12, x13, [x3,#8] ldp x10, x11, [x2,#8] sub x2, x2, #8 @@ -93,7 +93,7 @@ Lb01: cbz x18, Lrt Lb11: ldp x12, x13, [x3,#8]! ldp x10, x11, [x2,#8]! sub x1, x1, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: ldp x12, x13, [x3] @@ -106,7 +106,7 @@ Lb00: sub x2, x2, #16 b Lmid Lb10: sub x1, x1, #16 - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: bic x6, x12, x0 @@ -123,8 +123,8 @@ Lmid: bic x6, x12, x0 sbcs x9, x11, x7 ldp x10, x11, [x2,#32]! stp x8, x9, [x1,#32]! 
- sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: bic x6, x12, x0 bic x7, x13, x0 diff --git a/ext/gmp/gen/aarch64-macos/mpn/com.s b/ext/gmp/gen/aarch64-macos/mpn/com.s index 656c761ce6..5eaf1f4972 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/com.s +++ b/ext/gmp/gen/aarch64-macos/mpn/com.s @@ -45,6 +45,10 @@ + + + + @@ -60,37 +64,41 @@ ___gmpn_com: tbz x0, #3, Lal2 - ld1 {v22.1d}, [x1], #8 + ldr x4, [x1],#8 sub x2, x2, #1 - mvn v22.8b, v22.8b - st1 {v22.1d}, [x0], #8 + mvn x4, x4 + str x4, [x0],#8 -Lal2: ld1 {v26.2d}, [x1], #16 - subs x2, x2, #6 - b.lt Lend +Lal2: ldp x4,x5, [x1],#16 + sub x2, x2, #6 + tbnz x2, #63, Lend .align 4 -Ltop: ld1 {v22.2d}, [x1], #16 - mvn v26.16b, v26.16b - st1 {v26.2d}, [x0], #16 - ld1 {v26.2d}, [x1], #16 - mvn v22.16b, v22.16b - st1 {v22.2d}, [x0], #16 - subs x2, x2, #4 - b.ge Ltop +Ltop: ldp x6,x7, [x1],#32 + mvn x4, x4 + mvn x5, x5 + stp x4,x5, [x0],#32 + ldp x4,x5, [x1,#-16] + mvn x6, x6 + mvn x7, x7 + stp x6,x7, [x0,#-16] + sub x2, x2, #4 + tbz x2, #63, Ltop -Lend: mvn v26.16b, v26.16b - st1 {v26.2d}, [x0], #16 +Lend: mvn x4, x4 + mvn x5, x5 + stp x4,x5, [x0],#16 Lbc: tbz x2, #1, Ltl1 - ld1 {v22.2d}, [x1], #16 - mvn v22.16b, v22.16b - st1 {v22.2d}, [x0], #16 + ldp x4,x5, [x1],#16 + mvn x4, x4 + mvn x5, x5 + stp x4,x5, [x0],#16 Ltl1: tbz x2, #0, Ltl2 - ld1 {v22.1d}, [x1] - mvn v22.8b, v22.8b - st1 {v22.1d}, [x0] + ldr x4, [x1] + mvn x4, x4 + str x4, [x0] Ltl2: ret diff --git a/ext/gmp/gen/aarch64-macos/mpn/copyd.s b/ext/gmp/gen/aarch64-macos/mpn/copyd.s index 1b178c7c40..a3a3af68be 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/copyd.s +++ b/ext/gmp/gen/aarch64-macos/mpn/copyd.s @@ -43,6 +43,11 @@ + + + + + @@ -62,44 +67,31 @@ ___gmpn_copyd: tbz x0, #3, Lal2 - sub x1, x1, #8 - ld1 {v22.1d}, [x1] + ldr x4, [x1,#-8]! sub x2, x2, #1 - sub x0, x0, #8 - st1 {v22.1d}, [x0] + str x4, [x0,#-8]! -Lal2: sub x1, x1, #16 - ld1 {v26.2d}, [x1] +Lal2: ldp x4,x5, [x1,#-16]! 
sub x2, x2, #6 - sub x0, x0, #16 tbnz x2, #63, Lend - sub x1, x1, #16 - mov x12, #-16 - .align 4 -Ltop: ld1 {v22.2d}, [x1], x12 - st1 {v26.2d}, [x0], x12 - ld1 {v26.2d}, [x1], x12 - st1 {v22.2d}, [x0], x12 +Ltop: ldp x6,x7, [x1,#-16] + stp x4,x5, [x0,#-16] + ldp x4,x5, [x1,#-32]! + stp x6,x7, [x0,#-32]! sub x2, x2, #4 tbz x2, #63, Ltop - add x1, x1, #16 - -Lend: st1 {v26.2d}, [x0] +Lend: stp x4,x5, [x0,#-16]! Lbc: tbz x2, #1, Ltl1 - sub x1, x1, #16 - ld1 {v22.2d}, [x1] - sub x0, x0, #16 - st1 {v22.2d}, [x0] + ldp x4,x5, [x1,#-16]! + stp x4,x5, [x0,#-16]! Ltl1: tbz x2, #0, Ltl2 - sub x1, x1, #8 - ld1 {v22.1d}, [x1] - sub x0, x0, #8 - st1 {v22.1d}, [x0] + ldr x4, [x1,#-8] + str x4, [x0,#-8] Ltl2: ret diff --git a/ext/gmp/gen/aarch64-macos/mpn/copyi.s b/ext/gmp/gen/aarch64-macos/mpn/copyi.s index 95e54eaefd..b87f4fcc6f 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/copyi.s +++ b/ext/gmp/gen/aarch64-macos/mpn/copyi.s @@ -45,6 +45,10 @@ + + + + @@ -60,31 +64,31 @@ ___gmpn_copyi: tbz x0, #3, Lal2 - ld1 {v22.1d}, [x1], #8 + ldr x4, [x1],#8 sub x2, x2, #1 - st1 {v22.1d}, [x0], #8 + str x4, [x0],#8 -Lal2: ld1 {v26.2d}, [x1], #16 +Lal2: ldp x4,x5, [x1],#16 sub x2, x2, #6 tbnz x2, #63, Lend .align 4 -Ltop: ld1 {v22.2d}, [x1], #16 - st1 {v26.2d}, [x0], #16 - ld1 {v26.2d}, [x1], #16 - st1 {v22.2d}, [x0], #16 +Ltop: ldp x6,x7, [x1],#32 + stp x4,x5, [x0],#32 + ldp x4,x5, [x1,#-16] + stp x6,x7, [x0,#-16] sub x2, x2, #4 tbz x2, #63, Ltop -Lend: st1 {v26.2d}, [x0], #16 +Lend: stp x4,x5, [x0],#16 Lbc: tbz x2, #1, Ltl1 - ld1 {v22.2d}, [x1], #16 - st1 {v22.2d}, [x0], #16 + ldp x4,x5, [x1],#16 + stp x4,x5, [x0],#16 Ltl1: tbz x2, #0, Ltl2 - ld1 {v22.1d}, [x1] - st1 {v22.1d}, [x0] + ldr x4, [x1] + str x4, [x0] Ltl2: ret diff --git a/ext/gmp/gen/aarch64-macos/mpn/divrem_1.s b/ext/gmp/gen/aarch64-macos/mpn/divrem_1.s new file mode 100644 index 0000000000..3d4ca8c84f --- /dev/null +++ b/ext/gmp/gen/aarch64-macos/mpn/divrem_1.s @@ -0,0 +1,235 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 3 + .globl ___gmpn_preinv_divrem_1 + +___gmpn_preinv_divrem_1: + cbz x3, Lfz + stp x29, x30, [sp, #-80]! + mov x29, sp + stp x19, x20, [sp, #16] + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + + sub x21, x3, #1 + add x7, x21, x1 + add x20, x2, x21, lsl #3 + add x19, x0, x7, lsl #3 + mov x24, x1 + mov x22, x4 + mov x0, x5 + tbnz x4, #63, Lnentry + mov x23, x6 + b Luentry + + + .text + .align 3 + .globl ___gmpn_divrem_1 + +___gmpn_divrem_1: + cbz x3, Lfz + stp x29, x30, [sp, #-80]! + mov x29, sp + stp x19, x20, [sp, #16] + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + + sub x21, x3, #1 + add x7, x21, x1 + add x20, x2, x21, lsl #3 + add x19, x0, x7, lsl #3 + mov x24, x1 + mov x22, x4 + tbnz x4, #63, Lnormalised + +Lunnorm: + clz x23, x22 + lsl x0, x22, x23 + bl ___gmpn_invert_limb +Luentry: + lsl x22, x22, x23 + ldr x7, [x20], #-8 + sub x8, xzr, x23 + lsr x11, x7, x8 + lsl x1, x7, x23 + cbz x21, Luend + +Lutop:ldr x7, [x20], #-8 + add x2, x11, #1 + mul x10, x11, x0 + umulh x17, x11, x0 + lsr x9, x7, x8 + orr x1, x1, x9 + adds x10, x1, x10 + adc x2, x2, x17 + msub x11, x22, x2, x1 + lsl x1, x7, x23 + cmp x10, x11 + add x14, x11, x22 + csel x11, x14, x11, cc + sbc x2, x2, xzr + cmp x11, x22 + bcs Lufx +Luok: str x2, [x19], #-8 + sub x21, x21, #1 + cbnz x21, Lutop + +Luend:add x2, x11, #1 + mul x10, x11, x0 + umulh x17, x11, x0 + adds x10, x1, x10 + adc x2, x2, x17 + msub x11, x22, x2, x1 + cmp x10, x11 + add x14, x11, x22 + csel x11, x14, x11, cc + sbc x2, x2, xzr + subs x14, x11, x22 + adc x2, x2, xzr + csel x11, x14, x11, cs + str x2, [x19], #-8 + + cbnz x24, Lftop + lsr x0, x11, x23 + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp x29, x30, [sp], #80 + ret + +Lufx: add x2, x2, #1 + sub x11, x11, x22 + b Luok + + +Lnormalised: + mov x0, x22 + bl ___gmpn_invert_limb +Lnentry: + ldr x7, [x20], #-8 + subs x14, x7, x22 + adc x2, xzr, 
xzr + csel x11, x14, x7, cs + b Lnok + +Lntop:ldr x1, [x20], #-8 + add x2, x11, #1 + mul x10, x11, x0 + umulh x17, x11, x0 + adds x10, x1, x10 + adc x2, x2, x17 + msub x11, x22, x2, x1 + cmp x10, x11 + add x14, x11, x22 + csel x11, x14, x11, cc + sbc x2, x2, xzr + cmp x11, x22 + bcs Lnfx +Lnok: str x2, [x19], #-8 + sub x21, x21, #1 + tbz x21, #63, Lntop + +Lnend:cbnz x24, Lfrac + mov x0, x11 + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp x29, x30, [sp], #80 + ret + +Lnfx: add x2, x2, #1 + sub x11, x11, x22 + b Lnok + +Lfrac:mov x23, #0 +Lftop:add x2, x11, #1 + mul x10, x11, x0 + umulh x17, x11, x0 + add x2, x2, x17 + msub x11, x22, x2, xzr + cmp x10, x11 + add x14, x11, x22 + csel x11, x14, x11, cc + sbc x2, x2, xzr + str x2, [x19], #-8 + sub x24, x24, #1 + cbnz x24, Lftop + + lsr x0, x11, x23 + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp x29, x30, [sp], #80 + ret + + +Lfz: cbz x1, Lzend +Lztop:str xzr, [x0], #8 + sub x1, x1, #1 + cbnz x1, Lztop +Lzend:mov x0, #0 + ret + diff --git a/ext/gmp/gen/aarch64-macos/mpn/ior_n.s b/ext/gmp/gen/aarch64-macos/mpn/ior_n.s index cfd315a938..4b4b643ece 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/ior_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/ior_n.s @@ -73,7 +73,7 @@ .globl ___gmpn_ior_n ___gmpn_ior_n: - lsr x18, x3, #2 + lsr x17, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x7, [x1] @@ -82,7 +82,7 @@ Lbx1: ldr x7, [x1] str x15, [x0],#8 tbnz x3, #1, Lb11 -Lb01: cbz x18, Lret +Lb01: cbz x17, Lret ldp x4, x5, [x1,#8] ldp x8, x9, [x2,#8] sub x1, x1, #8 @@ -93,7 +93,7 @@ Lb11: ldp x6, x7, [x1,#8] ldp x10, x11, [x2,#8] add x1, x1, #8 add x2, x2, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -104,7 +104,7 @@ Lb00: ldp x4, x5, [x1],#-16 Lb10: ldp x6, x7, [x1] ldp x10, x11, [x2] - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -117,8 +117,8 @@ Lmid: ldp x6, x7, [x1,#32]! 
orr x12, x4, x8 orr x13, x5, x9 stp x12, x13, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: orr x12, x6, x10 orr x13, x7, x11 diff --git a/ext/gmp/gen/aarch64-macos/mpn/iorn_n.s b/ext/gmp/gen/aarch64-macos/mpn/iorn_n.s index 94cc90a0ff..73d86e94a0 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/iorn_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/iorn_n.s @@ -73,7 +73,7 @@ .globl ___gmpn_iorn_n ___gmpn_iorn_n: - lsr x18, x3, #2 + lsr x17, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x7, [x1] @@ -82,7 +82,7 @@ Lbx1: ldr x7, [x1] str x15, [x0],#8 tbnz x3, #1, Lb11 -Lb01: cbz x18, Lret +Lb01: cbz x17, Lret ldp x4, x5, [x1,#8] ldp x8, x9, [x2,#8] sub x1, x1, #8 @@ -93,7 +93,7 @@ Lb11: ldp x6, x7, [x1,#8] ldp x10, x11, [x2,#8] add x1, x1, #8 add x2, x2, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -104,7 +104,7 @@ Lb00: ldp x4, x5, [x1],#-16 Lb10: ldp x6, x7, [x1] ldp x10, x11, [x2] - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -117,8 +117,8 @@ Lmid: ldp x6, x7, [x1,#32]! 
orn x12, x4, x8 orn x13, x5, x9 stp x12, x13, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: orn x12, x6, x10 orn x13, x7, x11 diff --git a/ext/gmp/gen/aarch64-macos/mpn/lshift.s b/ext/gmp/gen/aarch64-macos/mpn/lshift.s index 75dc0fbc9a..14e5a71bd2 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/lshift.s +++ b/ext/gmp/gen/aarch64-macos/mpn/lshift.s @@ -73,7 +73,7 @@ ___gmpn_lshift: add x16, x0, x2, lsl #3 add x1, x1, x2, lsl #3 sub x8, xzr, x3 - lsr x18, x2, #2 + lsr x17, x2, #2 tbz x2, #0, Lbx0 Lbx1: ldr x4, [x1,#-8] @@ -81,7 +81,7 @@ Lbx1: ldr x4, [x1,#-8] Lb01: lsr x0, x4, x8 lsl x2, x4, x3 - cbnz x18, Lgt1 + cbnz x17, Lgt1 str x2, [x16,#-8] ret Lgt1: ldp x4, x5, [x1,#-24] @@ -101,7 +101,7 @@ Lb10: lsr x0, x5, x8 lsl x13, x5, x3 lsr x10, x4, x8 lsl x2, x4, x3 - cbnz x18, Lgt2 + cbnz x17, Lgt2 orr x10, x10, x13 stp x2, x10, [x16,#-16] ret @@ -135,11 +135,11 @@ Llo2: lsr x10, x4, x8 orr x11, x12, x2 stp x10, x11, [x16,#-32]! lsl x2, x4, x3 -Llo0: sub x18, x18, #1 +Llo0: sub x17, x17, #1 Llo3: lsr x10, x6, x8 lsl x13, x7, x3 lsr x12, x7, x8 - cbnz x18, Ltop + cbnz x17, Ltop Lend: orr x10, x10, x13 orr x11, x12, x2 diff --git a/ext/gmp/gen/aarch64-macos/mpn/lshiftc.s b/ext/gmp/gen/aarch64-macos/mpn/lshiftc.s index c4e0b33084..438136f554 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/lshiftc.s +++ b/ext/gmp/gen/aarch64-macos/mpn/lshiftc.s @@ -73,7 +73,7 @@ ___gmpn_lshiftc: add x16, x0, x2, lsl #3 add x1, x1, x2, lsl #3 sub x8, xzr, x3 - lsr x18, x2, #2 + lsr x17, x2, #2 tbz x2, #0, Lbx0 Lbx1: ldr x4, [x1,#-8] @@ -81,7 +81,7 @@ Lbx1: ldr x4, [x1,#-8] Lb01: lsr x0, x4, x8 lsl x2, x4, x3 - cbnz x18, Lgt1 + cbnz x17, Lgt1 mvn x2, x2 str x2, [x16,#-8] ret @@ -102,7 +102,7 @@ Lb10: lsr x0, x5, x8 lsl x13, x5, x3 lsr x10, x4, x8 lsl x2, x4, x3 - cbnz x18, Lgt2 + cbnz x17, Lgt2 eon x10, x10, x13 mvn x2, x2 stp x2, x10, [x16,#-16] @@ -137,11 +137,11 @@ Llo2: lsr x10, x4, x8 eon x11, x12, x2 stp x10, x11, [x16,#-32]! 
lsl x2, x4, x3 -Llo0: sub x18, x18, #1 +Llo0: sub x17, x17, #1 Llo3: lsr x10, x6, x8 lsl x13, x7, x3 lsr x12, x7, x8 - cbnz x18, Ltop + cbnz x17, Ltop Lend: eon x10, x10, x13 eon x11, x12, x2 diff --git a/ext/gmp/gen/aarch64-macos/mpn/mul_1.s b/ext/gmp/gen/aarch64-macos/mpn/mul_1.s index 9a369b1627..7858152199 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/mul_1.s +++ b/ext/gmp/gen/aarch64-macos/mpn/mul_1.s @@ -54,6 +54,7 @@ + .text @@ -71,7 +72,7 @@ ___gmpn_mul_1c: ___gmpn_mul_1: adds x4, xzr, xzr -Lcom: lsr x18, x2, #2 +Lcom: lsr x17, x2, #2 tbnz x2, #0, Lbx1 Lbx0: mov x11, x4 @@ -80,7 +81,7 @@ Lbx0: mov x11, x4 Lb10: ldp x4, x5, [x1] mul x8, x4, x3 umulh x10, x4, x3 - cbz x18, L2 + cbz x17, L2 ldp x6, x7, [x1,#16]! mul x9, x5, x3 b Lmid-8 @@ -95,7 +96,7 @@ Lbx1: ldr x7, [x1],#8 str x9, [x0],#8 tbnz x2, #1, Lb10 -Lb01: cbz x18, L1 +Lb01: cbz x17, L1 Lb00: ldp x6, x7, [x1] mul x8, x6, x3 @@ -105,8 +106,8 @@ Lb00: ldp x6, x7, [x1] adcs x12, x8, x11 umulh x11, x7, x3 add x0, x0, #16 - sub x18, x18, #1 - cbz x18, Lend + sub x17, x17, #1 + cbz x17, Lend .align 4 Ltop: mul x8, x4, x3 @@ -125,8 +126,8 @@ Lmid: mul x8, x6, x3 stp x12, x13, [x0],#32 adcs x12, x8, x11 umulh x11, x7, x3 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: mul x8, x4, x3 adcs x13, x9, x10 diff --git a/ext/gmp/gen/aarch64-macos/mpn/nand_n.s b/ext/gmp/gen/aarch64-macos/mpn/nand_n.s index dd75975513..083703a2d8 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/nand_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/nand_n.s @@ -74,7 +74,7 @@ .globl ___gmpn_nand_n ___gmpn_nand_n: - lsr x18, x3, #2 + lsr x17, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x7, [x1] @@ -84,7 +84,7 @@ Lbx1: ldr x7, [x1] str x15, [x0],#8 tbnz x3, #1, Lb11 -Lb01: cbz x18, Lret +Lb01: cbz x17, Lret ldp x4, x5, [x1,#8] ldp x8, x9, [x2,#8] sub x1, x1, #8 @@ -95,7 +95,7 @@ Lb11: ldp x6, x7, [x1,#8] ldp x10, x11, [x2,#8] add x1, x1, #8 add x2, x2, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -106,7 +106,7 @@ 
Lb00: ldp x4, x5, [x1],#-16 Lb10: ldp x6, x7, [x1] ldp x10, x11, [x2] - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -123,8 +123,8 @@ Lmid: ldp x6, x7, [x1,#32]! mvn x12, x12 mvn x13, x13 stp x12, x13, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: and x12, x6, x10 and x13, x7, x11 diff --git a/ext/gmp/gen/aarch64-macos/mpn/nior_n.s b/ext/gmp/gen/aarch64-macos/mpn/nior_n.s index 03a2061d69..392a012812 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/nior_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/nior_n.s @@ -74,7 +74,7 @@ .globl ___gmpn_nior_n ___gmpn_nior_n: - lsr x18, x3, #2 + lsr x17, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x7, [x1] @@ -84,7 +84,7 @@ Lbx1: ldr x7, [x1] str x15, [x0],#8 tbnz x3, #1, Lb11 -Lb01: cbz x18, Lret +Lb01: cbz x17, Lret ldp x4, x5, [x1,#8] ldp x8, x9, [x2,#8] sub x1, x1, #8 @@ -95,7 +95,7 @@ Lb11: ldp x6, x7, [x1,#8] ldp x10, x11, [x2,#8] add x1, x1, #8 add x2, x2, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -106,7 +106,7 @@ Lb00: ldp x4, x5, [x1],#-16 Lb10: ldp x6, x7, [x1] ldp x10, x11, [x2] - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -123,8 +123,8 @@ Lmid: ldp x6, x7, [x1,#32]! 
mvn x12, x12 mvn x13, x13 stp x12, x13, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: orr x12, x6, x10 orr x13, x7, x11 diff --git a/ext/gmp/gen/aarch64-macos/mpn/rsblsh1_n.s b/ext/gmp/gen/aarch64-macos/mpn/rsblsh1_n.s index 40fa58cb1b..2d5cf138c0 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/rsblsh1_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/rsblsh1_n.s @@ -86,14 +86,14 @@ .globl ___gmpn_rsblsh1_n ___gmpn_rsblsh1_n: - lsr x18, x3, #2 + lsr x6, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x5, [x1] tbnz x3, #1, Lb11 Lb01: ldr x11, [x2] - cbz x18, L1 + cbz x6, L1 ldp x8, x9, [x2,#8] lsl x13, x11, #1 subs x15, x13, x5 @@ -115,7 +115,7 @@ Lb11: ldr x9, [x2] subs x17, x13, x5 str x17, [x0],#8 sub x1, x1, #8 - cbz x18, Lend + cbz x6, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -128,7 +128,7 @@ Lb00: subs x11, xzr, xzr Lb10: subs x9, xzr, xzr ldp x10, x11, [x2] sub x1, x1, #16 - cbz x18, Lend + cbz x6, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -145,8 +145,8 @@ Lmid: ldp x4, x5, [x1,#32]! 
sbcs x16, x12, x4 sbcs x17, x13, x5 stp x16, x17, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x6, x6, #1 + cbnz x6, Ltop Lend: ldp x4, x5, [x1,#16] extr x12, x10, x9, #63 diff --git a/ext/gmp/gen/aarch64-macos/mpn/rsblsh2_n.s b/ext/gmp/gen/aarch64-macos/mpn/rsblsh2_n.s index 91d6002042..1e79fe1257 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/rsblsh2_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/rsblsh2_n.s @@ -86,14 +86,14 @@ .globl ___gmpn_rsblsh2_n ___gmpn_rsblsh2_n: - lsr x18, x3, #2 + lsr x6, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x5, [x1] tbnz x3, #1, Lb11 Lb01: ldr x11, [x2] - cbz x18, L1 + cbz x6, L1 ldp x8, x9, [x2,#8] lsl x13, x11, #2 subs x15, x13, x5 @@ -115,7 +115,7 @@ Lb11: ldr x9, [x2] subs x17, x13, x5 str x17, [x0],#8 sub x1, x1, #8 - cbz x18, Lend + cbz x6, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -128,7 +128,7 @@ Lb00: subs x11, xzr, xzr Lb10: subs x9, xzr, xzr ldp x10, x11, [x2] sub x1, x1, #16 - cbz x18, Lend + cbz x6, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -145,8 +145,8 @@ Lmid: ldp x4, x5, [x1,#32]! 
sbcs x16, x12, x4 sbcs x17, x13, x5 stp x16, x17, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x6, x6, #1 + cbnz x6, Ltop Lend: ldp x4, x5, [x1,#16] extr x12, x10, x9, #62 diff --git a/ext/gmp/gen/aarch64-macos/mpn/rsh1add_n.s b/ext/gmp/gen/aarch64-macos/mpn/rsh1add_n.s index 3920913f18..0e46013bda 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/rsh1add_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/rsh1add_n.s @@ -66,7 +66,7 @@ .globl ___gmpn_rsh1add_n ___gmpn_rsh1add_n: - lsr x18, x3, #2 + lsr x6, x3, #2 tbz x3, #0, Lbx0 @@ -76,7 +76,7 @@ Lbx1: ldr x5, [x1],#8 Lb01: adds x13, x5, x9 and x10, x13, #1 - cbz x18, L1 + cbz x6, L1 ldp x4, x5, [x1],#48 ldp x8, x9, [x2],#48 adcs x14, x4, x8 @@ -87,8 +87,8 @@ Lb01: adds x13, x5, x9 adcs x12, x4, x8 adcs x13, x5, x9 str x17, [x0], #24 - sub x18, x18, #1 - cbz x18, Lend + sub x6, x6, #1 + cbz x6, Lend b Ltop L1: cset x14, cs @@ -104,7 +104,7 @@ Lb11: adds x15, x5, x9 ldp x8, x9, [x2],#32 adcs x12, x4, x8 adcs x13, x5, x9 - cbz x18, L3 + cbz x6, L3 ldp x4, x5, [x1,#-16] ldp x8, x9, [x2,#-16] extr x17, x12, x15, #1 @@ -124,7 +124,7 @@ Lb10: ldp x4, x5, [x1],#32 adds x12, x4, x8 adcs x13, x5, x9 and x10, x12, #1 - cbz x18, L2 + cbz x6, L2 ldp x4, x5, [x1,#-16] ldp x8, x9, [x2,#-16] adcs x14, x4, x8 @@ -141,8 +141,8 @@ Lb00: ldp x4, x5, [x1],#48 adcs x12, x4, x8 adcs x13, x5, x9 add x0, x0, #16 - sub x18, x18, #1 - cbz x18, Lend + sub x6, x6, #1 + cbz x6, Lend .align 4 Ltop: ldp x4, x5, [x1,#-16] @@ -159,8 +159,8 @@ Lmid: ldp x4, x5, [x1],#32 adcs x12, x4, x8 adcs x13, x5, x9 stp x16, x17, [x0],#32 - sub x18, x18, #1 - cbnz x18, Ltop + sub x6, x6, #1 + cbnz x6, Ltop Lend: extr x16, x15, x14, #1 extr x17, x12, x15, #1 diff --git a/ext/gmp/gen/aarch64-macos/mpn/rsh1sub_n.s b/ext/gmp/gen/aarch64-macos/mpn/rsh1sub_n.s index 745db9ec02..2600b60f32 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/rsh1sub_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/rsh1sub_n.s @@ -66,7 +66,7 @@ .globl ___gmpn_rsh1sub_n ___gmpn_rsh1sub_n: - lsr x18, x3, #2 + lsr x6, 
x3, #2 tbz x3, #0, Lbx0 @@ -76,7 +76,7 @@ Lbx1: ldr x5, [x1],#8 Lb01: subs x13, x5, x9 and x10, x13, #1 - cbz x18, L1 + cbz x6, L1 ldp x4, x5, [x1],#48 ldp x8, x9, [x2],#48 sbcs x14, x4, x8 @@ -87,8 +87,8 @@ Lb01: subs x13, x5, x9 sbcs x12, x4, x8 sbcs x13, x5, x9 str x17, [x0], #24 - sub x18, x18, #1 - cbz x18, Lend + sub x6, x6, #1 + cbz x6, Lend b Ltop L1: cset x14, cc @@ -104,7 +104,7 @@ Lb11: subs x15, x5, x9 ldp x8, x9, [x2],#32 sbcs x12, x4, x8 sbcs x13, x5, x9 - cbz x18, L3 + cbz x6, L3 ldp x4, x5, [x1,#-16] ldp x8, x9, [x2,#-16] extr x17, x12, x15, #1 @@ -124,7 +124,7 @@ Lb10: ldp x4, x5, [x1],#32 subs x12, x4, x8 sbcs x13, x5, x9 and x10, x12, #1 - cbz x18, L2 + cbz x6, L2 ldp x4, x5, [x1,#-16] ldp x8, x9, [x2,#-16] sbcs x14, x4, x8 @@ -141,8 +141,8 @@ Lb00: ldp x4, x5, [x1],#48 sbcs x12, x4, x8 sbcs x13, x5, x9 add x0, x0, #16 - sub x18, x18, #1 - cbz x18, Lend + sub x6, x6, #1 + cbz x6, Lend .align 4 Ltop: ldp x4, x5, [x1,#-16] @@ -159,8 +159,8 @@ Lmid: ldp x4, x5, [x1],#32 sbcs x12, x4, x8 sbcs x13, x5, x9 stp x16, x17, [x0],#32 - sub x18, x18, #1 - cbnz x18, Ltop + sub x6, x6, #1 + cbnz x6, Ltop Lend: extr x16, x15, x14, #1 extr x17, x12, x15, #1 diff --git a/ext/gmp/gen/aarch64-macos/mpn/rshift.s b/ext/gmp/gen/aarch64-macos/mpn/rshift.s index 472e5bc426..a27751987b 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/rshift.s +++ b/ext/gmp/gen/aarch64-macos/mpn/rshift.s @@ -72,7 +72,7 @@ ___gmpn_rshift: mov x16, x0 sub x8, xzr, x3 - lsr x18, x2, #2 + lsr x17, x2, #2 tbz x2, #0, Lbx0 Lbx1: ldr x5, [x1] @@ -80,7 +80,7 @@ Lbx1: ldr x5, [x1] Lb01: lsl x0, x5, x8 lsr x2, x5, x3 - cbnz x18, Lgt1 + cbnz x17, Lgt1 str x2, [x16] ret Lgt1: ldp x4, x5, [x1,#8] @@ -101,7 +101,7 @@ Lb10: lsl x0, x4, x8 lsr x13, x4, x3 lsl x10, x5, x8 lsr x2, x5, x3 - cbnz x18, Lgt2 + cbnz x17, Lgt2 orr x10, x10, x13 stp x10, x2, [x16] ret @@ -133,11 +133,11 @@ Llo2: lsl x10, x5, x8 orr x11, x12, x2 stp x11, x10, [x16,#32]! 
lsr x2, x5, x3 -Llo0: sub x18, x18, #1 +Llo0: sub x17, x17, #1 Llo3: lsl x10, x7, x8 lsl x12, x6, x8 lsr x13, x6, x3 - cbnz x18, Ltop + cbnz x17, Ltop Lend: orr x10, x10, x13 orr x11, x12, x2 diff --git a/ext/gmp/gen/aarch64-macos/mpn/sqr_diag_addlsh1.s b/ext/gmp/gen/aarch64-macos/mpn/sqr_diag_addlsh1.s index 0e01e2858a..0255158b72 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/sqr_diag_addlsh1.s +++ b/ext/gmp/gen/aarch64-macos/mpn/sqr_diag_addlsh1.s @@ -58,7 +58,7 @@ ___gmpn_sqr_diag_addlsh1: ldr x15, [x2],#8 - lsr x18, x3, #1 + lsr x14, x3, #1 tbz x3, #0, Lbx0 Lbx1: adds x7, xzr, xzr @@ -73,8 +73,8 @@ Lbx0: adds x5, xzr, xzr ldr x17, [x2],#16 ldp x6, x7, [x1],#32 umulh x11, x15, x15 - sub x18, x18, #1 - cbz x18, Lend + sub x14, x14, #1 + cbz x14, Lend .align 4 Ltop: extr x9, x6, x5, #63 @@ -95,8 +95,8 @@ Lmid: extr x9, x4, x7, #63 extr x8, x5, x4, #63 stp x12, x13, [x0],#16 adcs x12, x8, x10 - sub x18, x18, #1 - cbnz x18, Ltop + sub x14, x14, #1 + cbnz x14, Ltop Lend: extr x9, x6, x5, #63 mul x10, x17, x17 diff --git a/ext/gmp/gen/aarch64-macos/mpn/sub_n.s b/ext/gmp/gen/aarch64-macos/mpn/sub_n.s index 0ed940928d..3695521862 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/sub_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/sub_n.s @@ -77,7 +77,7 @@ ___gmpn_sub_nc: ___gmpn_sub_n: cmp xzr, xzr -Lent: lsr x18, x3, #2 +Lent: lsr x17, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x7, [x1] @@ -86,7 +86,7 @@ Lbx1: ldr x7, [x1] str x13, [x0],#8 tbnz x3, #1, Lb11 -Lb01: cbz x18, Lret +Lb01: cbz x17, Lret ldp x4, x5, [x1,#8] ldp x8, x9, [x2,#8] sub x1, x1, #8 @@ -97,7 +97,7 @@ Lb11: ldp x6, x7, [x1,#8] ldp x10, x11, [x2,#8] add x1, x1, #8 add x2, x2, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -110,7 +110,7 @@ Lb00: ldp x4, x5, [x1] Lb10: ldp x6, x7, [x1] ldp x10, x11, [x2] - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -123,8 +123,8 @@ Lmid: ldp x6, x7, [x1,#32]! 
sbcs x12, x4, x8 sbcs x13, x5, x9 stp x12, x13, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: sbcs x12, x6, x10 sbcs x13, x7, x11 diff --git a/ext/gmp/gen/aarch64-macos/mpn/sublsh1_n.s b/ext/gmp/gen/aarch64-macos/mpn/sublsh1_n.s index 7bc7204291..e3e924f379 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/sublsh1_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/sublsh1_n.s @@ -86,14 +86,14 @@ .globl ___gmpn_sublsh1_n ___gmpn_sublsh1_n: - lsr x18, x3, #2 + lsr x6, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x5, [x1] tbnz x3, #1, Lb11 Lb01: ldr x11, [x2] - cbz x18, L1 + cbz x6, L1 ldp x8, x9, [x2,#8] lsl x13, x11, #1 subs x15, x5, x13 @@ -115,7 +115,7 @@ Lb11: ldr x9, [x2] subs x17, x5, x13 str x17, [x0],#8 sub x1, x1, #8 - cbz x18, Lend + cbz x6, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -128,7 +128,7 @@ Lb00: subs x11, xzr, xzr Lb10: subs x9, xzr, xzr ldp x10, x11, [x2] sub x1, x1, #16 - cbz x18, Lend + cbz x6, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -145,8 +145,8 @@ Lmid: ldp x4, x5, [x1,#32]! 
sbcs x16, x4, x12 sbcs x17, x5, x13 stp x16, x17, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x6, x6, #1 + cbnz x6, Ltop Lend: ldp x4, x5, [x1,#16] extr x12, x10, x9, #63 diff --git a/ext/gmp/gen/aarch64-macos/mpn/sublsh2_n.s b/ext/gmp/gen/aarch64-macos/mpn/sublsh2_n.s index 3b37de4c79..0df8084a3b 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/sublsh2_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/sublsh2_n.s @@ -86,14 +86,14 @@ .globl ___gmpn_sublsh2_n ___gmpn_sublsh2_n: - lsr x18, x3, #2 + lsr x6, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x5, [x1] tbnz x3, #1, Lb11 Lb01: ldr x11, [x2] - cbz x18, L1 + cbz x6, L1 ldp x8, x9, [x2,#8] lsl x13, x11, #2 subs x15, x5, x13 @@ -115,7 +115,7 @@ Lb11: ldr x9, [x2] subs x17, x5, x13 str x17, [x0],#8 sub x1, x1, #8 - cbz x18, Lend + cbz x6, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -128,7 +128,7 @@ Lb00: subs x11, xzr, xzr Lb10: subs x9, xzr, xzr ldp x10, x11, [x2] sub x1, x1, #16 - cbz x18, Lend + cbz x6, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -145,8 +145,8 @@ Lmid: ldp x4, x5, [x1,#32]! 
sbcs x16, x4, x12 sbcs x17, x5, x13 stp x16, x17, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x6, x6, #1 + cbnz x6, Ltop Lend: ldp x4, x5, [x1,#16] extr x12, x10, x9, #62 diff --git a/ext/gmp/gen/aarch64-macos/mpn/submul_1.s b/ext/gmp/gen/aarch64-macos/mpn/submul_1.s index 439b82096b..11f80f4673 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/submul_1.s +++ b/ext/gmp/gen/aarch64-macos/mpn/submul_1.s @@ -54,6 +54,11 @@ + + + + + diff --git a/ext/gmp/gen/aarch64-macos/mpn/tmp-add_n.s b/ext/gmp/gen/aarch64-macos/mpn/tmp-add_n.s new file mode 100644 index 0000000000..5f2b539e98 --- /dev/null +++ b/ext/gmp/gen/aarch64-macos/mpn/tmp-add_n.s @@ -0,0 +1,134 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 3 + .globl ___gmpn_add_nc + +___gmpn_add_nc: + cmp x4, #1 + b Lent + + .text + .align 3 + .globl ___gmpn_add_n + +___gmpn_add_n: + cmn xzr, xzr +Lent: lsr x17, x3, #2 + tbz x3, #0, Lbx0 + +Lbx1: ldr x7, [x1] + ldr x11, [x2] + adcs x13, x7, x11 + str x13, [x0],#8 + tbnz x3, #1, Lb11 + +Lb01: cbz x17, Lret + ldp x4, x5, [x1,#8] + ldp x8, x9, [x2,#8] + sub x1, x1, #8 + sub x2, x2, #8 + b Lmid + +Lb11: ldp x6, x7, [x1,#8] + ldp x10, x11, [x2,#8] + add x1, x1, #8 + add x2, x2, #8 + cbz x17, Lend + b Ltop + +Lbx0: tbnz x3, #1, Lb10 + +Lb00: ldp x4, x5, [x1] + ldp x8, x9, [x2] + sub x1, x1, #16 + sub x2, x2, #16 + b Lmid + +Lb10: ldp x6, x7, [x1] + ldp x10, x11, [x2] + cbz x17, Lend + + .align 4 +Ltop: ldp x4, x5, [x1,#16] + ldp x8, x9, [x2,#16] + adcs x12, x6, x10 + adcs x13, x7, x11 + stp x12, x13, [x0],#16 +Lmid: ldp x6, x7, [x1,#32]! + ldp x10, x11, [x2,#32]! 
+ adcs x12, x4, x8 + adcs x13, x5, x9 + stp x12, x13, [x0],#16 + sub x17, x17, #1 + cbnz x17, Ltop + +Lend: adcs x12, x6, x10 + adcs x13, x7, x11 + stp x12, x13, [x0] +Lret: cset x0, cs + ret + diff --git a/ext/gmp/gen/aarch64-macos/mpn/xnor_n.s b/ext/gmp/gen/aarch64-macos/mpn/xnor_n.s index 5ddbb4a9d2..345378e20c 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/xnor_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/xnor_n.s @@ -73,7 +73,7 @@ .globl ___gmpn_xnor_n ___gmpn_xnor_n: - lsr x18, x3, #2 + lsr x17, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x7, [x1] @@ -82,7 +82,7 @@ Lbx1: ldr x7, [x1] str x15, [x0],#8 tbnz x3, #1, Lb11 -Lb01: cbz x18, Lret +Lb01: cbz x17, Lret ldp x4, x5, [x1,#8] ldp x8, x9, [x2,#8] sub x1, x1, #8 @@ -93,7 +93,7 @@ Lb11: ldp x6, x7, [x1,#8] ldp x10, x11, [x2,#8] add x1, x1, #8 add x2, x2, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -104,7 +104,7 @@ Lb00: ldp x4, x5, [x1],#-16 Lb10: ldp x6, x7, [x1] ldp x10, x11, [x2] - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -117,8 +117,8 @@ Lmid: ldp x6, x7, [x1,#32]! 
eon x12, x4, x8 eon x13, x5, x9 stp x12, x13, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: eon x12, x6, x10 eon x13, x7, x11 diff --git a/ext/gmp/gen/aarch64-macos/mpn/xor_n.s b/ext/gmp/gen/aarch64-macos/mpn/xor_n.s index d696c28867..2b59a4b079 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/xor_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/xor_n.s @@ -73,7 +73,7 @@ .globl ___gmpn_xor_n ___gmpn_xor_n: - lsr x18, x3, #2 + lsr x17, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x7, [x1] @@ -82,7 +82,7 @@ Lbx1: ldr x7, [x1] str x15, [x0],#8 tbnz x3, #1, Lb11 -Lb01: cbz x18, Lret +Lb01: cbz x17, Lret ldp x4, x5, [x1,#8] ldp x8, x9, [x2,#8] sub x1, x1, #8 @@ -93,7 +93,7 @@ Lb11: ldp x6, x7, [x1,#8] ldp x10, x11, [x2,#8] add x1, x1, #8 add x2, x2, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -104,7 +104,7 @@ Lb00: ldp x4, x5, [x1],#-16 Lb10: ldp x6, x7, [x1] ldp x10, x11, [x2] - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -117,8 +117,8 @@ Lmid: ldp x6, x7, [x1,#32]! eor x12, x4, x8 eor x13, x5, x9 stp x12, x13, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: eor x12, x6, x10 eor x13, x7, x11 diff --git a/ext/gmp/gen/aarch64-macos/sieve_table.h b/ext/gmp/gen/aarch64-macos/sieve_table.h new file mode 100644 index 0000000000..ee9ac14360 --- /dev/null +++ b/ext/gmp/gen/aarch64-macos/sieve_table.h @@ -0,0 +1,46 @@ +/* This file generated by gen-sieve.c - DO NOT EDIT. 
*/ + +#if GMP_LIMB_BITS != 64 +Error, error, this data is for 64 bits +#endif + +#define PRIMESIEVE_INIT_TABLE \ + CNST_LIMB (0x3294C9E069128480), /* 5 - 196 (42 primes) */ \ + CNST_LIMB (0x95A35E1EC4AB21DC), /* 197 - 388 (32 primes) */ \ + CNST_LIMB (0x4AD7CE99B8693366), /* 389 - 580 (30 primes) */ \ + CNST_LIMB (0x6595B6DA728DC52B), /* 581 - 772 (30 primes) */ \ + CNST_LIMB (0xEA6D9F8787B0CEDE), /* 773 - 964 (26 primes) */ \ + CNST_LIMB (0x3F56A1F4CD3275A9), /* 965 - 1156 (29 primes) */ \ + CNST_LIMB (0xFD3848FB74A76ADB), /* 1157 - 1348 (26 primes) */ \ + CNST_LIMB (0xDBBA0DD1A1EDF6AF), /* 1349 - 1540 (25 primes) */ \ + CNST_LIMB (0xCEC7F17ED22799A5), /* 1541 - 1732 (27 primes) */ \ + CNST_LIMB (0xEAEC17BDBB717D56), /* 1733 - 1924 (24 primes) */ \ + CNST_LIMB (0x3B0EB7B3585AFCF3), /* 1925 - 2116 (26 primes) */ \ + CNST_LIMB (0xE563D8F69FDF6C4F), /* 2117 - 2308 (23 primes) */ \ + CNST_LIMB (0xFE5BA7ABA45E92FC), /* 2309 - 2500 (25 primes) */ \ + CNST_LIMB (0x158DEE6F3BF49B7D), /* 2501 - 2692 (24 primes) */ \ + CNST_LIMB (0xBE5A7BC4EDE6CD1A), /* 2693 - 2884 (26 primes) */ \ + CNST_LIMB (0xD7679B3FCA7BB6AD), /* 2885 - 3076 (22 primes) */ \ + CNST_LIMB (0xC3F66B971FEF37E9), /* 3077 - 3268 (22 primes) */ \ + CNST_LIMB (0x6F7EBCF339C953FD), /* 3269 - 3460 (22 primes) */ \ + CNST_LIMB (0xD5A5ECDCD235DBF0), /* 3461 - 3652 (27 primes) */ \ + CNST_LIMB (0xECFA7B2FD5B65E3B), /* 3653 - 3844 (22 primes) */ \ + CNST_LIMB (0xD28EFDF9C89F67B1), /* 3845 - 4036 (25 primes) */ \ + CNST_LIMB (0xCB7F7C7A3DD3AF4F), /* 4037 - 4228 (21 primes) */ \ + CNST_LIMB (0xEEBED6CDFF6B32CC), /* 4229 - 4420 (22 primes) */ \ + CNST_LIMB (0xD5BD73F85ECFA97C), /* 4421 - 4612 (23 primes) */ \ + CNST_LIMB (0x21FDBE4FBBAD48F7), /* 4613 - 4804 (24 primes) */ \ + CNST_LIMB (0x5E35A3B5EEB7FDE7), /* 4805 - 4996 (21 primes) */ \ + CNST_LIMB (0xD9EBFD53A7DBBCC9), /* 4997 - 5188 (22 primes) */ \ + CNST_LIMB (0xFF9EDEAF2EFE1F76), /* 5189 - 5380 (18 primes) */ +#define PRIMESIEVE_NUMBEROF_TABLE 28 +/* #define 
PRIMESIEVE_PRIMES_IN_TABLE 706 */ +#define PRIMESIEVE_HIGHEST_PRIME 5351 +/* #define PRIMESIEVE_FIRST_UNCHECKED 5381 */ + +#define SIEVE_MASK1 CNST_LIMB(0x3204C1A049120485) +#define SIEVE_MASKT CNST_LIMB(0xA1204892058) +#define SIEVE_2MSK1 CNST_LIMB(0x29048402110840A) +#define SIEVE_2MSK2 CNST_LIMB(0x9402180C40230184) +#define SIEVE_2MSKT CNST_LIMB(0x5021088402120) + diff --git a/ext/gmp/gen/x86_64-linux/config.h b/ext/gmp/gen/x86_64-linux/config.h new file mode 100644 index 0000000000..47840ffb13 --- /dev/null +++ b/ext/gmp/gen/x86_64-linux/config.h @@ -0,0 +1,668 @@ +/* config.h. Generated from config.in by configure. */ +/* config.in. Generated from configure.ac by autoheader. */ + +/* + +Copyright 1996-2020 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. +*/ + +/* Define if building universal (internal helper macro) */ +/* #undef AC_APPLE_UNIVERSAL_BUILD */ + +/* The gmp-mparam.h file (a string) the tune program should suggest updating. 
+ */ +#define GMP_MPARAM_H_SUGGEST "./mpn/x86_64/k8/gmp-mparam.h" + +/* Define to 1 if you have the `alarm' function. */ +#define HAVE_ALARM 1 + +/* Define to 1 if alloca() works (via gmp-impl.h). */ +#define HAVE_ALLOCA 1 + +/* Define to 1 if you have and it should be used (not on Ultrix). + */ +#define HAVE_ALLOCA_H 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((const)) */ +#define HAVE_ATTRIBUTE_CONST 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((malloc)) */ +#define HAVE_ATTRIBUTE_MALLOC 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((mode (XX))) + */ +#define HAVE_ATTRIBUTE_MODE 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((noreturn)) */ +#define HAVE_ATTRIBUTE_NORETURN 1 + +/* Define to 1 if you have the `attr_get' function. */ +/* #undef HAVE_ATTR_GET */ + +/* Define to 1 if tests/libtests has calling conventions checking for the CPU + */ +#define HAVE_CALLING_CONVENTIONS 1 + +/* Define to 1 if you have the `clock' function. */ +#define HAVE_CLOCK 1 + +/* Define to 1 if you have the `clock_gettime' function */ +#define HAVE_CLOCK_GETTIME 1 + +/* Define to 1 if you have the `cputime' function. */ +/* #undef HAVE_CPUTIME */ + +/* Define to 1 if you have the declaration of `fgetc', and to 0 if you don't. + */ +#define HAVE_DECL_FGETC 1 + +/* Define to 1 if you have the declaration of `fscanf', and to 0 if you don't. + */ +#define HAVE_DECL_FSCANF 1 + +/* Define to 1 if you have the declaration of `optarg', and to 0 if you don't. + */ +#define HAVE_DECL_OPTARG 1 + +/* Define to 1 if you have the declaration of `sys_errlist', and to 0 if you + don't. */ +#define HAVE_DECL_SYS_ERRLIST 0 + +/* Define to 1 if you have the declaration of `sys_nerr', and to 0 if you + don't. */ +#define HAVE_DECL_SYS_NERR 0 + +/* Define to 1 if you have the declaration of `ungetc', and to 0 if you don't. 
+ */ +#define HAVE_DECL_UNGETC 1 + +/* Define to 1 if you have the declaration of `vfprintf', and to 0 if you + don't. */ +#define HAVE_DECL_VFPRINTF 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define one of the following to 1 for the format of a `double'. + If your format is not among these choices, or you don't know what it is, + then leave all undefined. + IEEE_LITTLE_SWAPPED means little endian, but with the two 4-byte halves + swapped, as used by ARM CPUs in little endian mode. */ +/* #undef HAVE_DOUBLE_IEEE_BIG_ENDIAN */ +#define HAVE_DOUBLE_IEEE_LITTLE_ENDIAN 1 +/* #undef HAVE_DOUBLE_IEEE_LITTLE_SWAPPED */ +/* #undef HAVE_DOUBLE_VAX_D */ +/* #undef HAVE_DOUBLE_VAX_G */ +/* #undef HAVE_DOUBLE_CRAY_CFP */ + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_FLOAT_H 1 + +/* Define to 1 if you have the `getpagesize' function. */ +#define HAVE_GETPAGESIZE 1 + +/* Define to 1 if you have the `getrusage' function. */ +#define HAVE_GETRUSAGE 1 + +/* Define to 1 if you have the `getsysinfo' function. */ +/* #undef HAVE_GETSYSINFO */ + +/* Define to 1 if you have the `gettimeofday' function. */ +#define HAVE_GETTIMEOFDAY 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((visibility)) + and __attribute__ ((alias)) */ +#define HAVE_HIDDEN_ALIAS 1 + +/* Define one of these to 1 for the host CPU family. + If your CPU is not in any of these families, leave all undefined. + For an AMD64 chip, define "x86" in ABI=32, but not in ABI=64. */ +/* #undef HAVE_HOST_CPU_FAMILY_alpha */ +/* #undef HAVE_HOST_CPU_FAMILY_m68k */ +/* #undef HAVE_HOST_CPU_FAMILY_power */ +/* #undef HAVE_HOST_CPU_FAMILY_powerpc */ +/* #undef HAVE_HOST_CPU_FAMILY_x86 */ +#define HAVE_HOST_CPU_FAMILY_x86_64 1 + +/* Define one of the following to 1 for the host CPU, as per the output of + ./config.guess. If your CPU is not listed here, leave all undefined. 
*/ +/* #undef HAVE_HOST_CPU_alphaev67 */ +/* #undef HAVE_HOST_CPU_alphaev68 */ +/* #undef HAVE_HOST_CPU_alphaev7 */ +/* #undef HAVE_HOST_CPU_m68020 */ +/* #undef HAVE_HOST_CPU_m68030 */ +/* #undef HAVE_HOST_CPU_m68040 */ +/* #undef HAVE_HOST_CPU_m68060 */ +/* #undef HAVE_HOST_CPU_m68360 */ +/* #undef HAVE_HOST_CPU_powerpc604 */ +/* #undef HAVE_HOST_CPU_powerpc604e */ +/* #undef HAVE_HOST_CPU_powerpc750 */ +/* #undef HAVE_HOST_CPU_powerpc7400 */ +/* #undef HAVE_HOST_CPU_supersparc */ +/* #undef HAVE_HOST_CPU_i386 */ +/* #undef HAVE_HOST_CPU_i586 */ +/* #undef HAVE_HOST_CPU_i686 */ +/* #undef HAVE_HOST_CPU_pentium */ +/* #undef HAVE_HOST_CPU_pentiummmx */ +/* #undef HAVE_HOST_CPU_pentiumpro */ +/* #undef HAVE_HOST_CPU_pentium2 */ +/* #undef HAVE_HOST_CPU_pentium3 */ +/* #undef HAVE_HOST_CPU_pentium4 */ +/* #undef HAVE_HOST_CPU_core2 */ +/* #undef HAVE_HOST_CPU_nehalem */ +/* #undef HAVE_HOST_CPU_westmere */ +/* #undef HAVE_HOST_CPU_sandybridge */ +/* #undef HAVE_HOST_CPU_ivybridge */ +/* #undef HAVE_HOST_CPU_haswell */ +/* #undef HAVE_HOST_CPU_broadwell */ +/* #undef HAVE_HOST_CPU_skylake */ +/* #undef HAVE_HOST_CPU_silvermont */ +/* #undef HAVE_HOST_CPU_goldmont */ +/* #undef HAVE_HOST_CPU_k8 */ +/* #undef HAVE_HOST_CPU_k10 */ +/* #undef HAVE_HOST_CPU_bulldozer */ +/* #undef HAVE_HOST_CPU_piledriver */ +/* #undef HAVE_HOST_CPU_steamroller */ +/* #undef HAVE_HOST_CPU_excavator */ +/* #undef HAVE_HOST_CPU_zen */ +/* #undef HAVE_HOST_CPU_bobcat */ +/* #undef HAVE_HOST_CPU_jaguar */ +/* #undef HAVE_HOST_CPU_s390_z900 */ +/* #undef HAVE_HOST_CPU_s390_z990 */ +/* #undef HAVE_HOST_CPU_s390_z9 */ +/* #undef HAVE_HOST_CPU_s390_z10 */ +/* #undef HAVE_HOST_CPU_s390_z196 */ + +/* Define to 1 iff we have a s390 with 64-bit registers. */ +/* #undef HAVE_HOST_CPU_s390_zarch */ + +/* Define to 1 if the system has the type `intmax_t'. */ +#define HAVE_INTMAX_T 1 + +/* Define to 1 if the system has the type `intptr_t'. 
*/ +#define HAVE_INTPTR_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_INVENT_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LANGINFO_H 1 + +/* Define one of these to 1 for the endianness of `mp_limb_t'. + If the endianness is not a simple big or little, or you don't know what + it is, then leave both undefined. */ +/* #undef HAVE_LIMB_BIG_ENDIAN */ +#define HAVE_LIMB_LITTLE_ENDIAN 1 + +/* Define to 1 if you have the `localeconv' function. */ +#define HAVE_LOCALECONV 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LOCALE_H 1 + +/* Define to 1 if the system has the type `long double'. */ +#define HAVE_LONG_DOUBLE 1 + +/* Define to 1 if the system has the type `long long'. */ +#define HAVE_LONG_LONG 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MACHINE_HAL_SYSINFO_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `memset' function. */ +#define HAVE_MEMSET 1 + +/* Define to 1 if you have the `mmap' function. */ +#define HAVE_MMAP 1 + +/* Define to 1 if you have the `mprotect' function. */ +#define HAVE_MPROTECT 1 + +/* Define to 1 each of the following for which a native (ie. CPU specific) + implementation of the corresponding routine exists. 
*/ +#define HAVE_NATIVE_mpn_add_n 1 +/* #undef HAVE_NATIVE_mpn_add_n_sub_n */ +#define HAVE_NATIVE_mpn_add_nc 1 +#define HAVE_NATIVE_mpn_addaddmul_1msb0 1 +#define HAVE_NATIVE_mpn_addlsh1_n 1 +#define HAVE_NATIVE_mpn_addlsh2_n 1 +#define HAVE_NATIVE_mpn_addlsh_n 1 +/* #undef HAVE_NATIVE_mpn_addlsh1_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addmul_1c */ +#define HAVE_NATIVE_mpn_addmul_2 1 +/* #undef HAVE_NATIVE_mpn_addmul_3 */ +/* #undef HAVE_NATIVE_mpn_addmul_4 */ +/* #undef HAVE_NATIVE_mpn_addmul_5 */ +/* #undef HAVE_NATIVE_mpn_addmul_6 */ +/* #undef HAVE_NATIVE_mpn_addmul_7 */ +/* #undef HAVE_NATIVE_mpn_addmul_8 */ +/* #undef HAVE_NATIVE_mpn_addmul_2s */ +#define HAVE_NATIVE_mpn_and_n 1 +#define HAVE_NATIVE_mpn_andn_n 1 +#define HAVE_NATIVE_mpn_bdiv_dbm1c 1 +#define HAVE_NATIVE_mpn_bdiv_q_1 1 +#define HAVE_NATIVE_mpn_pi1_bdiv_q_1 1 +#define HAVE_NATIVE_mpn_cnd_add_n 1 +#define HAVE_NATIVE_mpn_cnd_sub_n 1 +#define HAVE_NATIVE_mpn_com 1 +#define HAVE_NATIVE_mpn_copyd 1 +#define HAVE_NATIVE_mpn_copyi 1 +#define HAVE_NATIVE_mpn_div_qr_1n_pi1 1 +/* #undef HAVE_NATIVE_mpn_div_qr_2 */ +#define HAVE_NATIVE_mpn_divexact_1 1 +/* #undef HAVE_NATIVE_mpn_divexact_by3c */ +#define HAVE_NATIVE_mpn_divrem_1 1 +/* #undef HAVE_NATIVE_mpn_divrem_1c */ +#define HAVE_NATIVE_mpn_divrem_2 1 +/* #undef HAVE_NATIVE_mpn_gcd_1 */ +#define HAVE_NATIVE_mpn_gcd_11 1 +#define HAVE_NATIVE_mpn_gcd_22 1 
+#define HAVE_NATIVE_mpn_hamdist 1 +#define HAVE_NATIVE_mpn_invert_limb 1 +#define HAVE_NATIVE_mpn_ior_n 1 +#define HAVE_NATIVE_mpn_iorn_n 1 +#define HAVE_NATIVE_mpn_lshift 1 +#define HAVE_NATIVE_mpn_lshiftc 1 +/* #undef HAVE_NATIVE_mpn_lshsub_n */ +/* #undef HAVE_NATIVE_mpn_mod_1 */ +#define HAVE_NATIVE_mpn_mod_1_1p 1 +/* #undef HAVE_NATIVE_mpn_mod_1c */ +#define HAVE_NATIVE_mpn_mod_1s_2p 1 +#define HAVE_NATIVE_mpn_mod_1s_4p 1 +#define HAVE_NATIVE_mpn_mod_34lsub1 1 +#define HAVE_NATIVE_mpn_modexact_1_odd 1 +#define HAVE_NATIVE_mpn_modexact_1c_odd 1 +#define HAVE_NATIVE_mpn_mul_1 1 +#define HAVE_NATIVE_mpn_mul_1c 1 +#define HAVE_NATIVE_mpn_mul_2 1 +/* #undef HAVE_NATIVE_mpn_mul_3 */ +/* #undef HAVE_NATIVE_mpn_mul_4 */ +/* #undef HAVE_NATIVE_mpn_mul_5 */ +/* #undef HAVE_NATIVE_mpn_mul_6 */ +#define HAVE_NATIVE_mpn_mul_basecase 1 +#define HAVE_NATIVE_mpn_mullo_basecase 1 +#define HAVE_NATIVE_mpn_nand_n 1 +#define HAVE_NATIVE_mpn_nior_n 1 +#define HAVE_NATIVE_mpn_popcount 1 +#define HAVE_NATIVE_mpn_preinv_divrem_1 1 +/* #undef HAVE_NATIVE_mpn_preinv_mod_1 */ +#define HAVE_NATIVE_mpn_redc_1 1 +/* #undef HAVE_NATIVE_mpn_redc_2 */ +#define HAVE_NATIVE_mpn_rsblsh1_n 1 +#define HAVE_NATIVE_mpn_rsblsh2_n 1 +#define HAVE_NATIVE_mpn_rsblsh_n 1 +/* #undef HAVE_NATIVE_mpn_rsblsh1_nc */ +/* #undef HAVE_NATIVE_mpn_rsblsh2_nc */ +/* #undef HAVE_NATIVE_mpn_rsblsh_nc */ +#define HAVE_NATIVE_mpn_rsh1add_n 1 +#define HAVE_NATIVE_mpn_rsh1add_nc 1 +#define HAVE_NATIVE_mpn_rsh1sub_n 1 +#define HAVE_NATIVE_mpn_rsh1sub_nc 1 +#define HAVE_NATIVE_mpn_rshift 1 +/* #undef HAVE_NATIVE_mpn_sbpi1_bdiv_r */ +#define HAVE_NATIVE_mpn_sqr_basecase 1 +/* #undef HAVE_NATIVE_mpn_sqr_diagonal */ +#define HAVE_NATIVE_mpn_sqr_diag_addlsh1 1 +#define HAVE_NATIVE_mpn_sub_n 1 +#define HAVE_NATIVE_mpn_sub_nc 1 +#define HAVE_NATIVE_mpn_sublsh1_n 1 +/* #undef HAVE_NATIVE_mpn_sublsh2_n */ +/* #undef HAVE_NATIVE_mpn_sublsh_n */ +/* #undef HAVE_NATIVE_mpn_sublsh1_nc */ +/* #undef HAVE_NATIVE_mpn_sublsh2_nc */ +/* 
#undef HAVE_NATIVE_mpn_sublsh_nc */ +/* #undef HAVE_NATIVE_mpn_sublsh1_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh2_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh1_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh2_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_submul_1c */ +/* #undef HAVE_NATIVE_mpn_tabselect */ +/* #undef HAVE_NATIVE_mpn_udiv_qrnnd */ +/* #undef HAVE_NATIVE_mpn_udiv_qrnnd_r */ +/* #undef HAVE_NATIVE_mpn_umul_ppmm */ +/* #undef HAVE_NATIVE_mpn_umul_ppmm_r */ +#define HAVE_NATIVE_mpn_xor_n 1 +#define HAVE_NATIVE_mpn_xnor_n 1 + +/* Define to 1 if you have the `nl_langinfo' function. */ +#define HAVE_NL_LANGINFO 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_NL_TYPES_H 1 + +/* Define to 1 if you have the `obstack_vprintf' function. */ +/* #define HAVE_OBSTACK_VPRINTF 1 */ + +/* Define to 1 if you have the `popen' function. */ +#define HAVE_POPEN 1 + +/* Define to 1 if you have the `processor_info' function. */ +/* #undef HAVE_PROCESSOR_INFO */ + +/* Define to 1 if `struct pst_processor' exists and contains + `psp_iticksperclktick'. */ +/* #undef HAVE_PSP_ITICKSPERCLKTICK */ + +/* Define to 1 if you have the `pstat_getprocessor' function. */ +/* #undef HAVE_PSTAT_GETPROCESSOR */ + +/* Define to 1 if the system has the type `ptrdiff_t'. */ +#define HAVE_PTRDIFF_T 1 + +/* Define to 1 if the system has the type `quad_t'. */ +#define HAVE_QUAD_T 1 + +/* Define to 1 if you have the `raise' function. */ +#define HAVE_RAISE 1 + +/* Define to 1 if you have the `read_real_time' function. */ +/* #undef HAVE_READ_REAL_TIME */ + +/* Define to 1 if you have the `sigaction' function. */ +#define HAVE_SIGACTION 1 + +/* Define to 1 if you have the `sigaltstack' function. */ +#define HAVE_SIGALTSTACK 1 + +/* Define to 1 if you have the `sigstack' function. 
*/ +#define HAVE_SIGSTACK 1 + +/* Tune directory speed_cyclecounter, undef=none, 1=32bits, 2=64bits) */ +#define HAVE_SPEED_CYCLECOUNTER 2 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SSTREAM */ + +/* Define to 1 if the system has the type `stack_t'. */ +#define HAVE_STACK_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if the system has the type `std::locale'. */ +/* #undef HAVE_STD__LOCALE */ + +/* Define to 1 if you have the `strchr' function. */ +#define HAVE_STRCHR 1 + +/* Define to 1 if you have the `strerror' function. */ +#define HAVE_STRERROR 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strnlen' function. */ +#define HAVE_STRNLEN 1 + +/* Define to 1 if you have the `strtol' function. */ +#define HAVE_STRTOL 1 + +/* Define to 1 if you have the `strtoul' function. */ +#define HAVE_STRTOUL 1 + +/* Define to 1 if you have the `sysconf' function. */ +#define HAVE_SYSCONF 1 + +/* Define to 1 if you have the `sysctl' function. */ +/* #undef HAVE_SYSCTL */ + +/* Define to 1 if you have the `sysctlbyname' function. */ +/* #undef HAVE_SYSCTLBYNAME */ + +/* Define to 1 if you have the `syssgi' function. */ +/* #undef HAVE_SYSSGI */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_ATTRIBUTES_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_IOGRAPH_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_MMAN_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PROCESSOR_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PSTAT_H */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SYS_RESOURCE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSCTL_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSINFO_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSSGI_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSTEMCFG_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIMES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the `times' function. */ +#define HAVE_TIMES 1 + +/* Define to 1 if the system has the type `uint_least32_t'. */ +#define HAVE_UINT_LEAST32_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to 1 if you have the `vsnprintf' function and it works properly. */ +#define HAVE_VSNPRINTF 1 + +/* Define to 1 for Windos/64 */ +/* #undef HOST_DOS64 */ + +/* Assembler local label prefix */ +#define LSYM_PREFIX ".L" + +/* Define to the sub-directory where libtool stores uninstalled libraries. */ +#define LT_OBJDIR ".libs/" + +/* Define to 1 to disable the use of inline assembly */ +/* #undef NO_ASM */ + +/* Name of package */ +#define PACKAGE "gmp" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "gmp-bugs@gmplib.org, see https://gmplib.org/manual/Reporting-Bugs.html" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "GNU MP" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "GNU MP 6.2.1" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "gmp" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "http://www.gnu.org/software/gmp/" + +/* Define to the version of this package. 
*/ +#define PACKAGE_VERSION "6.2.1" + +/* Define as the return type of signal handlers (`int' or `void'). */ +#define RETSIGTYPE void + +/* The size of `mp_limb_t', as computed by sizeof. */ +#define SIZEOF_MP_LIMB_T 8 + +/* The size of `unsigned', as computed by sizeof. */ +#define SIZEOF_UNSIGNED 4 + +/* The size of `unsigned long', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_LONG 8 + +/* The size of `unsigned short', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_SHORT 2 + +/* The size of `void *', as computed by sizeof. */ +#define SIZEOF_VOID_P 8 + +/* Define to 1 if sscanf requires writable inputs */ +/* #undef SSCANF_WRITABLE_INPUT */ + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define to 1 if you can safely include both and . */ +#define TIME_WITH_SYS_TIME 1 + +/* Maximum size the tune program can test for SQR_TOOM2_THRESHOLD */ +/* #undef TUNE_SQR_TOOM2_MAX */ + +/* Version number of package */ +#define VERSION "6.2.1" + +/* Define to 1 to enable ASSERT checking, per --enable-assert */ +/* #undef WANT_ASSERT */ + +/* Define to 1 to enable GMP_CPU_TYPE faking cpuid, per --enable-fake-cpuid */ +/* #undef WANT_FAKE_CPUID */ + +/* Define to 1 when building a fat binary. */ +/* #undef WANT_FAT_BINARY */ + +/* Define to 1 to enable FFTs for multiplication, per --enable-fft */ +#define WANT_FFT 1 + +/* Define to 1 to enable old mpn_mul_fft_full for multiplication, per + --enable-old-fft-full */ +/* #undef WANT_OLD_FFT_FULL */ + +/* Define to 1 if --enable-profiling=gprof */ +/* #undef WANT_PROFILING_GPROF */ + +/* Define to 1 if --enable-profiling=instrument */ +/* #undef WANT_PROFILING_INSTRUMENT */ + +/* Define to 1 if --enable-profiling=prof */ +/* #undef WANT_PROFILING_PROF */ + +/* Define one of these to 1 for the desired temporary memory allocation + method, per --enable-alloca. 
*/ +#define WANT_TMP_ALLOCA 1 +/* #undef WANT_TMP_REENTRANT */ +/* #undef WANT_TMP_NOTREENTRANT */ +/* #undef WANT_TMP_DEBUG */ + +/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most + significant byte first (like Motorola and SPARC, unlike Intel). */ +#if defined AC_APPLE_UNIVERSAL_BUILD +# if defined __BIG_ENDIAN__ +# define WORDS_BIGENDIAN 1 +# endif +#else +# ifndef WORDS_BIGENDIAN +/* # undef WORDS_BIGENDIAN */ +# endif +#endif + +/* Define to 1 if the assembler understands the mulx instruction */ +/* #undef X86_ASM_MULX */ + +/* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a + `char[]'. */ +/* #undef YYTEXT_POINTER */ + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +/* #undef inline */ +#endif + +/* Define to the equivalent of the C99 'restrict' keyword, or to + nothing if this is not supported. Do not define if restrict is + supported directly. */ +#define restrict __restrict +/* Work around a bug in Sun C++: it does not support _Restrict or + __restrict__, even though the corresponding Sun C compiler ends up with + "#define restrict _Restrict" or "#define restrict __restrict__" in the + previous line. Perhaps some future version of Sun C++ will work with + restrict; if so, hopefully it defines __RESTRICT like Sun C does. */ +#if defined __SUNPRO_CC && !defined __RESTRICT +# define _Restrict +# define __restrict__ +#endif + +/* Define to empty if the keyword `volatile' does not work. Warning: valid + code using `volatile' can become incorrect without. Disable with care. 
*/ +/* #undef volatile */ diff --git a/ext/gmp/gen/x86_64-linux/mpn/add_err1_n.s b/ext/gmp/gen/x86_64-linux/mpn/add_err1_n.s index 6c2ae338b4..2cbba6ad10 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/add_err1_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/add_err1_n.s @@ -189,20 +189,20 @@ __gmpn_add_err1_n: .align 32, 0x90 .Lloop: - mov (%rsi,%r9,8), %r14 shr $1, %al mov -8(%r8), %r10 mov $0, %r13d + mov (%rsi,%r9,8), %r14 + mov 8(%rsi,%r9,8), %r15 adc (%rdx,%r9,8), %r14 cmovnc %r13, %r10 - mov %r14, (%rdi,%r9,8) - mov 8(%rsi,%r9,8), %r15 - mov 16(%rsi,%r9,8), %r14 adc 8(%rdx,%r9,8), %r15 mov -16(%r8), %r11 + mov %r14, (%rdi,%r9,8) + mov 16(%rsi,%r9,8), %r14 + mov %r15, 8(%rdi,%r9,8) cmovnc %r13, %r11 mov -24(%r8), %r12 - mov %r15, 8(%rdi,%r9,8) adc 16(%rdx,%r9,8), %r14 cmovnc %r13, %r12 mov 24(%rsi,%r9,8), %r15 @@ -215,12 +215,12 @@ __gmpn_add_err1_n: adc $0, %rbp add %r12, %rbx adc $0, %rbp - lea -32(%r8), %r8 mov %r14, 16(%rdi,%r9,8) add %r13, %rbx + lea -32(%r8), %r8 adc $0, %rbp + mov %r15, 24(%rdi,%r9,8) add $4, %r9 - mov %r15, -8(%rdi,%r9,8) jnz .Lloop .Lend: diff --git a/ext/gmp/gen/x86_64-linux/mpn/add_n.s b/ext/gmp/gen/x86_64-linux/mpn/add_n.s index 400fe976ec..14cc32b0b9 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/add_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/add_n.s @@ -94,20 +94,18 @@ __gmpn_add_nc: - mov %ecx, %eax - shr $3, %rcx - and $7, %eax - - lea .Ltab(%rip), %r9 - neg %r8 + shr $2, %rcx + and $3, %eax + bt $0, %r8 + jrcxz .Llt4 - movslq (%r9,%rax,4), %rax - lea (%r9,%rax), %rax - jmp *%rax + mov (%rsi), %r8 + mov 8(%rsi), %r9 + dec %rcx + jmp .Lmid .size __gmpn_add_nc,.-__gmpn_add_nc - .align 16, 0x90 .globl __gmpn_add_n .type __gmpn_add_n,@function @@ -115,159 +113,82 @@ __gmpn_add_nc: __gmpn_add_n: - mov %ecx, %eax - shr $3, %rcx - and $7, %eax - - lea .Ltab(%rip), %r9 + shr $2, %rcx + and $3, %eax + jrcxz .Llt4 - movslq (%r9,%rax,4), %rax - lea (%r9,%rax), %rax - jmp *%rax - - -.L0: mov (%rsi), %r8 + mov (%rsi), %r8 mov 8(%rsi), %r9 + dec %rcx + jmp .Lmid + 
+.Llt4: dec %eax + mov (%rsi), %r8 + jnz .L2 adc (%rdx), %r8 - jmp .Le0 + mov %r8, (%rdi) + adc %eax, %eax + + ret -.L4: mov (%rsi), %r8 +.L2: dec %eax mov 8(%rsi), %r9 + jnz .L3 adc (%rdx), %r8 - lea -32(%rsi), %rsi - lea -32(%rdx), %rdx - lea -32(%rdi), %rdi - inc %rcx - jmp .Le4 - -.L5: mov (%rsi), %r11 - mov 8(%rsi), %r8 - mov 16(%rsi), %r9 - adc (%rdx), %r11 - lea -24(%rsi), %rsi - lea -24(%rdx), %rdx - lea -24(%rdi), %rdi - inc %rcx - jmp .Le5 - -.L6: mov (%rsi), %r10 - adc (%rdx), %r10 - mov 8(%rsi), %r11 - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi - inc %rcx - jmp .Le6 - -.L7: mov (%rsi), %r9 - mov 8(%rsi), %r10 - adc (%rdx), %r9 - adc 8(%rdx), %r10 - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi - inc %rcx - jmp .Le7 + adc 8(%rdx), %r9 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + adc %eax, %eax + + ret - .align 16, 0x90 -.Ltop: -.Le3: mov %r9, 40(%rdi) -.Le2: mov %r10, 48(%rdi) -.Le1: mov (%rsi), %r8 - mov 8(%rsi), %r9 +.L3: mov 16(%rsi), %r10 adc (%rdx), %r8 - mov %r11, 56(%rdi) - lea 64(%rdi), %rdi -.Le0: mov 16(%rsi), %r10 adc 8(%rdx), %r9 adc 16(%rdx), %r10 mov %r8, (%rdi) -.Le7: mov 24(%rsi), %r11 mov %r9, 8(%rdi) -.Le6: mov 32(%rsi), %r8 - mov 40(%rsi), %r9 - adc 24(%rdx), %r11 mov %r10, 16(%rdi) -.Le5: adc 32(%rdx), %r8 - mov %r11, 24(%rdi) -.Le4: mov 48(%rsi), %r10 - mov 56(%rsi), %r11 - mov %r8, 32(%rdi) - lea 64(%rsi), %rsi - adc 40(%rdx), %r9 - adc 48(%rdx), %r10 - adc 56(%rdx), %r11 - lea 64(%rdx), %rdx - dec %rcx - jnz .Ltop - -.Lend: mov %r9, 40(%rdi) - mov %r10, 48(%rdi) - mov %r11, 56(%rdi) - mov %ecx, %eax - adc %ecx, %eax + setc %al ret .align 16, 0x90 -.L3: mov (%rsi), %r9 - mov 8(%rsi), %r10 - mov 16(%rsi), %r11 - adc (%rdx), %r9 - adc 8(%rdx), %r10 - adc 16(%rdx), %r11 - jrcxz .Lx3 - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea -40(%rdi), %rdi - jmp .Le3 -.Lx3: mov %r9, (%rdi) - mov %r10, 8(%rdi) - mov %r11, 16(%rdi) - mov %ecx, %eax - adc %ecx, %eax - - ret +.Ltop: adc (%rdx), %r8 + adc 8(%rdx), %r9 + 
adc 16(%rdx), %r10 + adc 24(%rdx), %r11 + mov %r8, (%rdi) + lea 32(%rsi), %rsi + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + dec %rcx + mov %r11, 24(%rdi) + lea 32(%rdx), %rdx + mov (%rsi), %r8 + mov 8(%rsi), %r9 + lea 32(%rdi), %rdi +.Lmid: mov 16(%rsi), %r10 + mov 24(%rsi), %r11 + jnz .Ltop - .align 16, 0x90 -.L1: mov (%rsi), %r11 - adc (%rdx), %r11 - jrcxz .Lx1 - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea -56(%rdi), %rdi - jmp .Le1 -.Lx1: mov %r11, (%rdi) - mov %ecx, %eax - adc %ecx, %eax - - ret +.Lend: lea 32(%rsi), %rsi + adc (%rdx), %r8 + adc 8(%rdx), %r9 + adc 16(%rdx), %r10 + adc 24(%rdx), %r11 + lea 32(%rdx), %rdx + mov %r8, (%rdi) + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + mov %r11, 24(%rdi) + lea 32(%rdi), %rdi - .align 16, 0x90 -.L2: mov (%rsi), %r10 - mov 8(%rsi), %r11 - adc (%rdx), %r10 - adc 8(%rdx), %r11 - jrcxz .Lx2 - lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea -48(%rdi), %rdi - jmp .Le2 -.Lx2: mov %r10, (%rdi) - mov %r11, 8(%rdi) - mov %ecx, %eax - adc %ecx, %eax + inc %eax + dec %eax + jnz .Llt4 + adc %eax, %eax ret .size __gmpn_add_n,.-__gmpn_add_n - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .L0-.Ltab - .long .L1-.Ltab - .long .L2-.Ltab - .long .L3-.Ltab - .long .L4-.Ltab - .long .L5-.Ltab - .long .L6-.Ltab - .long .L7-.Ltab diff --git a/ext/gmp/gen/x86_64-linux/mpn/addlsh1_n.s b/ext/gmp/gen/x86_64-linux/mpn/addlsh1_n.s index cac8dd4b70..e3d3aae6c0 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/addlsh1_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/addlsh1_n.s @@ -46,15 +46,6 @@ - - - - - - - - - @@ -77,6 +68,9 @@ + + + @@ -86,29 +80,6 @@ .text - .align 16, 0x90 - .globl __gmpn_addlsh1_nc - .type __gmpn_addlsh1_nc,@function - -__gmpn_addlsh1_nc: - - - - push %rbp - mov %r8, %rax - neg %rax - xor %ebp, %ebp - mov (%rdx), %r8 - shrd $63, %r8, %rbp - mov %ecx, %r9d - and $3, %r9d - je .Lb00 - cmp $2, %r9d - jc .Lb01 - je .Lb10 - jmp .Lb11 - .size __gmpn_addlsh1_nc,.-__gmpn_addlsh1_nc - .align 16, 0x90 .globl __gmpn_addlsh1_n .type 
__gmpn_addlsh1_n,@function @@ -117,96 +88,92 @@ __gmpn_addlsh1_n: push %rbp - xor %ebp, %ebp + mov (%rdx), %r8 - shrd $63, %r8, %rbp mov %ecx, %eax + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rdx + neg %rcx + xor %ebp, %ebp and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: mov 8(%rdx), %r9 - shrd $63, %r9, %r8 - mov 16(%rdx), %r10 - shrd $63, %r10, %r9 - add %eax, %eax - adc (%rsi), %rbp - adc 8(%rsi), %r8 - adc 16(%rsi), %r9 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, %rbp - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea 24(%rdi), %rdi +.Lb11: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 sbb %eax, %eax - sub $3, %rcx - ja .Ltop - jmp .Lend - -.Lb01: add %eax, %eax - adc (%rsi), %rbp - mov %rbp, (%rdi) - mov %r8, %rbp - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi + add (%rsi,%rcx,8), %r8 + adc 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + adc 16(%rsi,%rcx,8), %r10 + mov %r10, 16(%rdi,%rcx,8) + sbb %ebp, %ebp + add $3, %rcx + jmp .Lent + +.Lb10: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 sbb %eax, %eax - sub $1, %rcx - ja .Ltop - jmp .Lend - -.Lb10: mov 8(%rdx), %r9 - shrd $63, %r9, %r8 - add %eax, %eax - adc (%rsi), %rbp - adc 8(%rsi), %r8 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, %rbp - lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea 16(%rdi), %rdi + add (%rsi,%rcx,8), %r8 + adc 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + sbb %ebp, %ebp + add $2, %rcx + jmp .Lent + +.Lb01: add %r8, %r8 sbb %eax, %eax - sub $2, %rcx - ja .Ltop - jmp .Lend + add (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + sbb %ebp, %ebp + inc %rcx +.Lent: jns .Lend .align 16, 0x90 -.Ltop: mov (%rdx), %r8 - shrd $63, %r8, %rbp -.Lb00: mov 8(%rdx), %r9 - shrd $63, %r9, %r8 - mov 16(%rdx), %r10 - shrd $63, %r10, %r9 - mov 24(%rdx), %r11 - shrd $63, %r11, %r10 - lea 32(%rdx), %rdx - add %eax, %eax - 
adc (%rsi), %rbp - adc 8(%rsi), %r8 - adc 16(%rsi), %r9 - adc 24(%rsi), %r10 - lea 32(%rsi), %rsi - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, 24(%rdi) - mov %r11, %rbp - lea 32(%rdi), %rdi +.Ltop: add %eax, %eax + + mov (%rdx,%rcx,8), %r8 +.Lb00: adc %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 + mov 24(%rdx,%rcx,8), %r11 + adc %r11, %r11 + sbb %eax, %eax - sub $4, %rcx - jnz .Ltop + add %ebp, %ebp + + adc (%rsi,%rcx,8), %r8 + nop + adc 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + adc 16(%rsi,%rcx,8), %r10 + adc 24(%rsi,%rcx,8), %r11 + mov %r10, 16(%rdi,%rcx,8) + mov %r11, 24(%rdi,%rcx,8) + + sbb %ebp, %ebp + add $4, %rcx + js .Ltop + +.Lend: + + add %ebp, %eax + neg %eax + -.Lend: shr $63, %rbp - add %eax, %eax - adc $0, %rbp - mov %rbp, %rax pop %rbp ret .size __gmpn_addlsh1_n,.-__gmpn_addlsh1_n - diff --git a/ext/gmp/gen/x86_64-linux/mpn/addlsh2_n.s b/ext/gmp/gen/x86_64-linux/mpn/addlsh2_n.s index 313daa83e2..00e20905cc 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/addlsh2_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/addlsh2_n.s @@ -46,10 +46,13 @@ - - - - + + + + + + + @@ -87,30 +90,11 @@ - .text - .align 16, 0x90 - .globl __gmpn_addlsh2_nc - .type __gmpn_addlsh2_nc,@function - -__gmpn_addlsh2_nc: - - push %rbp - mov %r8, %rax - neg %rax - xor %ebp, %ebp - mov (%rdx), %r8 - shrd $62, %r8, %rbp - mov %ecx, %r9d - and $3, %r9d - je .Lb00 - cmp $2, %r9d - jc .Lb01 - je .Lb10 - jmp .Lb11 - .size __gmpn_addlsh2_nc,.-__gmpn_addlsh2_nc + + .text .align 16, 0x90 .globl __gmpn_addlsh2_n .type __gmpn_addlsh2_n,@function @@ -118,96 +102,102 @@ __gmpn_addlsh2_nc: __gmpn_addlsh2_n: - push %rbp - xor %ebp, %ebp + push %r12 + push %r13 + push %r14 + push %r15 + mov (%rdx), %r8 - shrd $62, %r8, %rbp + lea (,%r8,4), %r12 + shr $62, %r8 + mov %ecx, %eax - and $3, %eax + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rdx + neg %rcx + and $3, %al je .Lb00 - cmp 
$2, %eax + cmp $2, %al jc .Lb01 je .Lb10 -.Lb11: mov 8(%rdx), %r9 - shrd $62, %r9, %r8 - mov 16(%rdx), %r10 - shrd $62, %r10, %r9 - add %eax, %eax - adc (%rsi), %rbp - adc 8(%rsi), %r8 - adc 16(%rsi), %r9 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, %rbp - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea 24(%rdi), %rdi - sbb %eax, %eax - sub $3, %rcx - ja .Ltop +.Lb11: mov 8(%rdx,%rcx,8), %r10 + lea (%r8,%r10,4), %r14 + shr $62, %r10 + mov 16(%rdx,%rcx,8), %r11 + lea (%r10,%r11,4), %r15 + shr $62, %r11 + add (%rsi,%rcx,8), %r12 + adc 8(%rsi,%rcx,8), %r14 + adc 16(%rsi,%rcx,8), %r15 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + mov %r14, 8(%rdi,%rcx,8) + mov %r15, 16(%rdi,%rcx,8) + add $3, %rcx + js .Ltop jmp .Lend -.Lb01: add %eax, %eax - adc (%rsi), %rbp - mov %rbp, (%rdi) - mov %r8, %rbp - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - sbb %eax, %eax - sub $1, %rcx - ja .Ltop +.Lb01: mov %r8, %r11 + add (%rsi,%rcx,8), %r12 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + add $1, %rcx + js .Ltop jmp .Lend -.Lb10: mov 8(%rdx), %r9 - shrd $62, %r9, %r8 - add %eax, %eax - adc (%rsi), %rbp - adc 8(%rsi), %r8 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, %rbp - lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea 16(%rdi), %rdi - sbb %eax, %eax - sub $2, %rcx - ja .Ltop +.Lb10: mov 8(%rdx,%rcx,8), %r11 + lea (%r8,%r11,4), %r15 + shr $62, %r11 + add (%rsi,%rcx,8), %r12 + adc 8(%rsi,%rcx,8), %r15 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + mov %r15, 8(%rdi,%rcx,8) + add $2, %rcx + js .Ltop jmp .Lend +.Lb00: mov 8(%rdx,%rcx,8), %r9 + mov 16(%rdx,%rcx,8), %r10 + jmp .Le00 + .align 16, 0x90 -.Ltop: mov (%rdx), %r8 - shrd $62, %r8, %rbp -.Lb00: mov 8(%rdx), %r9 - shrd $62, %r9, %r8 - mov 16(%rdx), %r10 - shrd $62, %r10, %r9 - mov 24(%rdx), %r11 - shrd $62, %r11, %r10 - lea 32(%rdx), %rdx - add %eax, %eax - adc (%rsi), %rbp - adc 8(%rsi), %r8 - adc 16(%rsi), %r9 - adc 24(%rsi), %r10 - lea 32(%rsi), %rsi - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov 
%r9, 16(%rdi) - mov %r10, 24(%rdi) - mov %r11, %rbp - lea 32(%rdi), %rdi - sbb %eax, %eax - sub $4, %rcx - jnz .Ltop - -.Lend: shr $62, %rbp - add %eax, %eax - adc $0, %rbp - mov %rbp, %rax - pop %rbp +.Ltop: mov 16(%rdx,%rcx,8), %r10 + mov (%rdx,%rcx,8), %r8 + mov 8(%rdx,%rcx,8), %r9 + lea (%r11,%r8,4), %r12 + shr $62, %r8 +.Le00: lea (%r8,%r9,4), %r13 + shr $62, %r9 + mov 24(%rdx,%rcx,8), %r11 + lea (%r9,%r10,4), %r14 + shr $62, %r10 + lea (%r10,%r11,4), %r15 + shr $62, %r11 + add %eax, %eax + adc (%rsi,%rcx,8), %r12 + adc 8(%rsi,%rcx,8), %r13 + adc 16(%rsi,%rcx,8), %r14 + adc 24(%rsi,%rcx,8), %r15 + mov %r12, (%rdi,%rcx,8) + mov %r13, 8(%rdi,%rcx,8) + mov %r14, 16(%rdi,%rcx,8) + sbb %eax, %eax + mov %r15, 24(%rdi,%rcx,8) + add $4, %rcx + js .Ltop +.Lend: + + + sub %r11d, %eax + neg %eax + + pop %r15 + pop %r14 + pop %r13 + pop %r12 ret .size __gmpn_addlsh2_n,.-__gmpn_addlsh2_n diff --git a/ext/gmp/gen/x86_64-linux/mpn/addlsh_n.s b/ext/gmp/gen/x86_64-linux/mpn/addlsh_n.s index 00e16c8d00..2d261d5e37 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/addlsh_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/addlsh_n.s @@ -65,32 +65,7 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - + @@ -103,7 +78,7 @@ .text - .align 32, 0x90 + .align 16, 0x90 .globl __gmpn_addlsh_n .type __gmpn_addlsh_n,@function @@ -111,142 +86,143 @@ __gmpn_addlsh_n: + push %r12 + push %rbp + push %rbx + + mov (%rdx), %rax + + mov $0, %ebp + sub %rcx, %rbp + + lea -16(%rsi,%rcx,8), %rsi + lea -16(%rdi,%rcx,8), %rdi + lea 16(%rdx,%rcx,8), %r12 + + mov %rcx, %r9 + + mov %r8, %rcx + mov $1, %r8d + shl %cl, %r8 + + mul %r8 + + and $3, %r9d + jz .Lb0 + cmp $2, %r9d + jc .Lb1 + jz .Lb2 + +.Lb3: mov %rax, %r11 + add 16(%rsi,%rbp,8), %r11 + mov -8(%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov (%r12,%rbp,8), %rax + mov %rdx, %r9 + mul %r8 + or %rax, %r9 + add $3, %rbp + jnz .Llo3 + jmp .Lcj3 + +.Lb2: mov %rax, %rbx + mov -8(%r12,%rbp,8), %rax + mov %rdx, %r9 + mul %r8 + or %rax, 
%r9 + add $2, %rbp + jz .Lcj2 + mov %rdx, %r10 + mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + xor %ecx, %ecx + jmp .Llo2 + +.Lb1: mov %rax, %r9 + mov %rdx, %r10 + add $1, %rbp + jnz .Lgt1 + add 8(%rsi,%rbp,8), %r9 + jmp .Lcj1 +.Lgt1: mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + add 8(%rsi,%rbp,8), %r9 + adc 16(%rsi,%rbp,8), %r10 + adc 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + jmp .Llo1 + +.Lb0: mov %rax, %r10 + mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + add 16(%rsi,%rbp,8), %r10 + adc 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov 8(%r12,%rbp,8), %rax + add $4, %rbp + jz .Lend - mov (%rdx), %r10 - - mov %ecx, %eax - shr $3, %rcx - xor %r9d, %r9d - sub %r8, %r9 - and $7, %eax - - lea .Ltab(%rip), %r11 - - movslq (%r11,%rax,4), %rax - add %r11, %rax - jmp *%rax - - -.L0: lea 32(%rsi), %rsi - lea 32(%rdx), %rdx - lea 32(%rdi), %rdi - xor %r11d, %r11d - jmp .Le0 - -.L7: mov %r10, %r11 - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea 24(%rdi), %rdi - xor %r10d, %r10d - jmp .Le7 - -.L6: lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea 16(%rdi), %rdi - xor %r11d, %r11d - jmp .Le6 - -.L5: mov %r10, %r11 - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - xor %r10d, %r10d - jmp .Le5 - -.Lend: adc 24(%rsi), %rax - mov %rax, -40(%rdi) - .byte 0xc4,194,179,0xf7,195 - adc %rcx, %rax + .align 8, 0x90 +.Ltop: mov %rdx, %r9 + mul %r8 + or %rax, %r9 + mov %r10, -16(%rdi,%rbp,8) +.Llo3: mov %rdx, %r10 + mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + mov %r11, -8(%rdi,%rbp,8) +.Llo2: mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + add %ecx, %ecx + adc (%rsi,%rbp,8), %rbx + adc 8(%rsi,%rbp,8), %r9 + adc 16(%rsi,%rbp,8), %r10 + adc 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rbx, (%rdi,%rbp,8) +.Llo1: mov %rdx, 
%rbx + mul %r8 + or %rax, %rbx + mov %r9, 8(%rdi,%rbp,8) +.Llo0: mov 8(%r12,%rbp,8), %rax + add $4, %rbp + jnz .Ltop + +.Lend: mov %rdx, %r9 + mul %r8 + or %rax, %r9 + mov %r10, -16(%rdi,%rbp,8) +.Lcj3: mov %r11, -8(%rdi,%rbp,8) +.Lcj2: add %ecx, %ecx + adc (%rsi,%rbp,8), %rbx + adc 8(%rsi,%rbp,8), %r9 + mov %rbx, (%rdi,%rbp,8) +.Lcj1: mov %r9, 8(%rdi,%rbp,8) + mov %rdx, %rax + adc $0, %rax + pop %rbx + pop %rbp + pop %r12 ret - - .align 32, 0x90 -.Ltop: jrcxz .Lend - mov -32(%rdx), %r10 - adc 24(%rsi), %rax - lea 64(%rsi), %rsi - .byte 0xc4,66,179,0xf7,219 - mov %rax, -40(%rdi) -.Le0: dec %rcx - .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov -24(%rdx), %r11 - adc -32(%rsi), %rax - .byte 0xc4,66,179,0xf7,210 - mov %rax, -32(%rdi) -.Le7: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - mov -16(%rdx), %r10 - adc -24(%rsi), %rax - .byte 0xc4,66,179,0xf7,219 - mov %rax, -24(%rdi) -.Le6: .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov -8(%rdx), %r11 - adc -16(%rsi), %rax - .byte 0xc4,66,179,0xf7,210 - mov %rax, -16(%rdi) -.Le5: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - mov (%rdx), %r10 - adc -8(%rsi), %rax - .byte 0xc4,66,179,0xf7,219 - mov %rax, -8(%rdi) -.Le4: .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov 8(%rdx), %r11 - adc (%rsi), %rax - .byte 0xc4,66,179,0xf7,210 - mov %rax, (%rdi) -.Le3: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - mov 16(%rdx), %r10 - adc 8(%rsi), %rax - .byte 0xc4,66,179,0xf7,219 - mov %rax, 8(%rdi) -.Le2: .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov 24(%rdx), %r11 - adc 16(%rsi), %rax - lea 64(%rdx), %rdx - .byte 0xc4,66,179,0xf7,210 - mov %rax, 16(%rdi) - lea 64(%rdi), %rdi -.Le1: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - jmp .Ltop - -.L4: xor %r11d, %r11d - jmp .Le4 - -.L3: mov %r10, %r11 - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi - xor %r10d, %r10d - jmp .Le3 - -.L2: lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi - xor 
%r11d, %r11d - jmp .Le2 - -.L1: mov %r10, %r11 - lea -24(%rsi), %rsi - lea 40(%rdx), %rdx - lea 40(%rdi), %rdi - xor %r10d, %r10d - jmp .Le1 .size __gmpn_addlsh_n,.-__gmpn_addlsh_n - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .L0-.Ltab - .long .L1-.Ltab - .long .L2-.Ltab - .long .L3-.Ltab - .long .L4-.Ltab - .long .L5-.Ltab - .long .L6-.Ltab - .long .L7-.Ltab - diff --git a/ext/gmp/gen/x86_64-linux/mpn/addmul_1.s b/ext/gmp/gen/x86_64-linux/mpn/addmul_1.s index 715dc68504..8daf1ac3cd 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/addmul_1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/addmul_1.s @@ -67,8 +67,6 @@ - - @@ -77,136 +75,122 @@ + + + + + + + + .text - .align 32, 0x90 + .align 16, 0x90 .globl __gmpn_addmul_1 .type __gmpn_addmul_1,@function __gmpn_addmul_1: - - mov %rcx, %r10 - mov %rdx, %rcx - mov %edx, %r8d - shr $3, %rcx - and $7, %r8d - mov %r10, %rdx - lea .Ltab(%rip), %r10 - movslq (%r10,%r8,4), %r8 - lea (%r8, %r10), %r10 - jmp *%r10 - - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .Lf0-.Ltab - .long .Lf1-.Ltab - .long .Lf2-.Ltab - .long .Lf3-.Ltab - .long .Lf4-.Ltab - .long .Lf5-.Ltab - .long .Lf6-.Ltab - .long .Lf7-.Ltab - .text -.Lf0: .byte 0xc4,98,171,0xf6,6 - lea -8(%rsi), %rsi - lea -8(%rdi), %rdi - lea -1(%rcx), %rcx - jmp .Lb0 - -.Lf3: .byte 0xc4,226,179,0xf6,6 - lea 16(%rsi), %rsi - lea -48(%rdi), %rdi - jmp .Lb3 - -.Lf4: .byte 0xc4,98,171,0xf6,6 - lea 24(%rsi), %rsi - lea -40(%rdi), %rdi - jmp .Lb4 - -.Lf5: .byte 0xc4,226,179,0xf6,6 - lea 32(%rsi), %rsi - lea -32(%rdi), %rdi - jmp .Lb5 - -.Lf6: .byte 0xc4,98,171,0xf6,6 - lea 40(%rsi), %rsi - lea -24(%rdi), %rdi - jmp .Lb6 - -.Lf1: .byte 0xc4,226,179,0xf6,6 - jrcxz .L1 - jmp .Lb1 -.L1: add (%rdi), %r9 - mov %r9, (%rdi) - adc %rcx, %rax - - ret -.Lend: .byte 0xf3,76,0x0f,0x38,0xf6,15 - mov %r9, (%rdi) - .byte 0xf3,72,0x0f,0x38,0xf6,193 - adc %rcx, %rax - - ret - nop;nop;nop;nop - -.Lf2: .byte 0xc4,98,171,0xf6,6 - lea 8(%rsi), %rsi - lea 8(%rdi), %rdi 
- .byte 0xc4,226,179,0xf6,6 - - .align 32, 0x90 -.Ltop: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, -8(%rdi) - jrcxz .Lend -.Lb1: .byte 0xc4,98,171,0xf6,70,8 - .byte 0xf3,76,0x0f,0x38,0xf6,15 - lea -1(%rcx), %rcx - mov %r9, (%rdi) - .byte 0x66,76,0x0f,0x38,0xf6,208 -.Lb0: .byte 0xc4,226,179,0xf6,70,16 - .byte 0x66,77,0x0f,0x38,0xf6,200 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) -.Lb7: .byte 0xc4,98,171,0xf6,70,24 - lea 64(%rsi), %rsi - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xf3,76,0x0f,0x38,0xf6,79,16 - mov %r9, 16(%rdi) -.Lb6: .byte 0xc4,226,179,0xf6,70,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, 24(%rdi) -.Lb5: .byte 0xc4,98,171,0xf6,70,232 - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xf3,76,0x0f,0x38,0xf6,79,32 - mov %r9, 32(%rdi) -.Lb4: .byte 0xc4,226,179,0xf6,70,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, 40(%rdi) -.Lb3: .byte 0xf3,76,0x0f,0x38,0xf6,79,48 - .byte 0xc4,98,171,0xf6,70,248 - mov %r9, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xc4,226,179,0xf6,6 - jmp .Ltop - -.Lf7: .byte 0xc4,226,179,0xf6,6 - lea -16(%rsi), %rsi - lea -16(%rdi), %rdi - jmp .Lb7 - .size __gmpn_addmul_1,.-__gmpn_addmul_1 + mov (%rsi), %rax + push %rbx + mov %rdx, %rbx + + mul %rcx + mov %rbx, %r11 + + and $3, %ebx + jz .Lb0 + cmp $2, %ebx + jz .Lb2 + jg .Lb3 + +.Lb1: dec %r11 + jne .Lgt1 + add %rax, (%rdi) + jmp .Lret +.Lgt1: lea 8(%rsi,%r11,8), %rsi + lea -8(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + xor %ebx, %ebx + mov %rax, %r9 + mov (%rsi,%r11,8), %rax + mov %rdx, %r8 + jmp .LL1 + +.Lb0: lea (%rsi,%r11,8), %rsi + lea -16(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + mov %rax, %r8 + mov %rdx, %rbx + jmp .LL0 + +.Lb3: lea -8(%rsi,%r11,8), %rsi + lea -24(%rdi,%r11,8), %rdi + neg %r11 + mov %rax, %rbx + mov %rdx, %r10 + jmp .LL3 + +.Lb2: lea -16(%rsi,%r11,8), %rsi + lea -32(%rdi,%r11,8), %rdi + neg 
%r11 + xor %r8, %r8 + xor %ebx, %ebx + mov %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %rdx, %r9 + jmp .LL2 + + .align 16, 0x90 +.Ltop: add %r10, (%rdi,%r11,8) + adc %rax, %r9 + mov (%rsi,%r11,8), %rax + adc %rdx, %r8 + mov $0, %r10d +.LL1: mul %rcx + add %r9, 8(%rdi,%r11,8) + adc %rax, %r8 + adc %rdx, %rbx +.LL0: mov 8(%rsi,%r11,8), %rax + mul %rcx + add %r8, 16(%rdi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 +.LL3: mov 16(%rsi,%r11,8), %rax + mul %rcx + add %rbx, 24(%rdi,%r11,8) + mov $0, %r8d + mov %r8, %rbx + adc %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %r8, %r9 + adc %rdx, %r9 +.LL2: mul %rcx + add $4, %r11 + js .Ltop + + add %r10, (%rdi,%r11,8) + adc %rax, %r9 + adc %r8, %rdx + add %r9, 8(%rdi,%r11,8) +.Lret: adc $0, %rdx + mov %rdx, %rax + + pop %rbx + + + ret + .size __gmpn_addmul_1,.-__gmpn_addmul_1 diff --git a/ext/gmp/gen/x86_64-linux/mpn/addmul_2.s b/ext/gmp/gen/x86_64-linux/mpn/addmul_2.s index 7fd478bd41..5883dab926 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/addmul_2.s +++ b/ext/gmp/gen/x86_64-linux/mpn/addmul_2.s @@ -83,171 +83,125 @@ + .text - .align 32, 0x90 + .align 16, 0x90 .globl __gmpn_addmul_2 .type __gmpn_addmul_2,@function __gmpn_addmul_2: + mov %rdx, %r11 push %rbx push %rbp - push %r12 - push %r13 - mov (%rcx), %r8 + mov 0(%rcx), %r8 mov 8(%rcx), %r9 - mov %rdx, %r11 - shr $2, %r11 - - test $1, %dl - jnz .Lbx1 - -.Lbx0: mov (%rdi), %r12 - mov 8(%rdi), %r13 - test $2, %dl - jnz .Lb10 - -.Lb00: mov (%rsi), %rdx - lea 16(%rsi), %rsi - .byte 0xc4,194,251,0xf6,200 - add %rax, %r12 - .byte 0xc4,194,251,0xf6,233 - adc $0, %rcx - mov %r12, (%rdi) - add %rax, %r13 - adc $0, %rbp - mov -8(%rsi), %rdx - lea 16(%rdi), %rdi - jmp .Llo0 + mov %edx, %ebx + mov (%rsi), %rax + lea -8(%rsi,%rdx,8), %rsi + lea -8(%rdi,%rdx,8), %rdi + mul %r8 + neg %r11 + and $3, %ebx + jz .Lb0 + cmp $2, %ebx + jc .Lb1 + jz .Lb2 + +.Lb3: mov %rax, %rcx + mov %rdx, %rbp + xor %r10d, %r10d + mov 8(%rsi,%r11,8), %rax + dec %r11 + jmp .Llo3 -.Lb10: mov (%rsi), %rdx - inc %r11 
- .byte 0xc4,194,251,0xf6,200 - add %rax, %r12 - adc $0, %rcx - .byte 0xc4,194,251,0xf6,233 - mov %r12, (%rdi) - mov 16(%rdi), %r12 - add %rax, %r13 - adc $0, %rbp - xor %rbx, %rbx +.Lb2: mov %rax, %rbp + mov 8(%rsi,%r11,8), %rax + mov %rdx, %r10 + xor %ebx, %ebx + add $-2, %r11 jmp .Llo2 -.Lbx1: mov (%rdi), %r13 - mov 8(%rdi), %r12 - test $2, %dl - jnz .Lb11 - -.Lb01: mov (%rsi), %rdx - .byte 0xc4,66,251,0xf6,208 - add %rax, %r13 - adc $0, %r10 - .byte 0xc4,194,251,0xf6,217 - add %rax, %r12 - adc $0, %rbx - mov 8(%rsi), %rdx - mov %r13, (%rdi) - mov 16(%rdi), %r13 - .byte 0xc4,194,251,0xf6,200 - lea 24(%rdi), %rdi - lea 24(%rsi), %rsi +.Lb1: mov %rax, %r10 + mov 8(%rsi,%r11,8), %rax + mov %rdx, %rbx + xor %ecx, %ecx + inc %r11 jmp .Llo1 -.Lb11: mov (%rsi), %rdx - inc %r11 - .byte 0xc4,66,251,0xf6,208 - add %rax, %r13 - adc $0, %r10 - .byte 0xc4,194,251,0xf6,217 - add %rax, %r12 - adc $0, %rbx - mov %r13, (%rdi) - mov 8(%rsi), %rdx - .byte 0xc4,194,251,0xf6,200 - lea 8(%rdi), %rdi - lea 8(%rsi), %rsi - jmp .Llo3 +.Lb0: mov $0, %r10d + mov %rax, %rbx + mov 8(%rsi,%r11,8), %rax + mov %rdx, %rcx + xor %ebp, %ebp + jmp .Llo0 + + .align 32, 0x90 +.Ltop: mov $0, %ecx + mul %r8 + add %rax, %r10 + mov (%rsi,%r11,8), %rax + adc %rdx, %rbx + adc $0, %ecx +.Llo1: mul %r9 + add %r10, (%rdi,%r11,8) + mov $0, %r10d + adc %rax, %rbx + mov $0, %ebp + mov 8(%rsi,%r11,8), %rax + adc %rdx, %rcx + mul %r8 + add %rax, %rbx + mov 8(%rsi,%r11,8), %rax + adc %rdx, %rcx + adc $0, %ebp +.Llo0: mul %r9 + add %rbx, 8(%rdi,%r11,8) + adc %rax, %rcx + adc %rdx, %rbp + mov 16(%rsi,%r11,8), %rax + mul %r8 + add %rax, %rcx + adc %rdx, %rbp + adc $0, %r10d + mov 16(%rsi,%r11,8), %rax +.Llo3: mul %r9 + add %rcx, 16(%rdi,%r11,8) + adc %rax, %rbp + adc %rdx, %r10 + xor %ebx, %ebx + mov 24(%rsi,%r11,8), %rax + mul %r8 + add %rax, %rbp + mov 24(%rsi,%r11,8), %rax + adc %rdx, %r10 + adc $0, %ebx +.Llo2: mul %r9 + add %rbp, 24(%rdi,%r11,8) + adc %rax, %r10 + adc %rdx, %rbx + mov 32(%rsi,%r11,8), %rax + add 
$4, %r11 + js .Ltop + +.Lend: xor %ecx, %ecx + mul %r8 + add %rax, %r10 + mov (%rsi), %rax + adc %rdx, %rbx + adc %ecx, %ecx + mul %r9 + add %r10, (%rdi) + adc %rax, %rbx + adc %rdx, %rcx + mov %rbx, 8(%rdi) + mov %rcx, %rax - .align 16, 0x90 -.Ltop: .byte 0xc4,66,251,0xf6,208 - add %rbx, %r13 - adc $0, %rbp - add %rax, %r13 - adc $0, %r10 - .byte 0xc4,194,251,0xf6,217 - add %rax, %r12 - adc $0, %rbx - lea 32(%rdi), %rdi - add %rcx, %r13 - mov -16(%rsi), %rdx - mov %r13, -24(%rdi) - adc $0, %r10 - add %rbp, %r12 - mov -8(%rdi), %r13 - .byte 0xc4,194,251,0xf6,200 - adc $0, %rbx -.Llo1: add %rax, %r12 - .byte 0xc4,194,251,0xf6,233 - adc $0, %rcx - add %r10, %r12 - mov %r12, -16(%rdi) - adc $0, %rcx - add %rax, %r13 - adc $0, %rbp - add %rbx, %r13 - mov -8(%rsi), %rdx - adc $0, %rbp -.Llo0: .byte 0xc4,66,251,0xf6,208 - add %rax, %r13 - adc $0, %r10 - mov (%rdi), %r12 - .byte 0xc4,194,251,0xf6,217 - add %rax, %r12 - adc $0, %rbx - add %rcx, %r13 - mov %r13, -8(%rdi) - adc $0, %r10 - mov (%rsi), %rdx - add %rbp, %r12 - .byte 0xc4,194,251,0xf6,200 - adc $0, %rbx -.Llo3: add %rax, %r12 - adc $0, %rcx - .byte 0xc4,194,251,0xf6,233 - add %r10, %r12 - mov 8(%rdi), %r13 - mov %r12, (%rdi) - mov 16(%rdi), %r12 - adc $0, %rcx - add %rax, %r13 - adc $0, %rbp -.Llo2: mov 8(%rsi), %rdx - lea 32(%rsi), %rsi - dec %r11 - jnz .Ltop - -.Lend: .byte 0xc4,66,251,0xf6,208 - add %rbx, %r13 - adc $0, %rbp - add %rax, %r13 - adc $0, %r10 - .byte 0xc4,194,235,0xf6,193 - add %rcx, %r13 - mov %r13, 8(%rdi) - adc $0, %r10 - add %rbp, %rdx - adc $0, %rax - add %r10, %rdx - mov %rdx, 16(%rdi) - adc $0, %rax - - pop %r13 - pop %r12 pop %rbp pop %rbx diff --git a/ext/gmp/gen/x86_64-linux/mpn/and_n.s b/ext/gmp/gen/x86_64-linux/mpn/and_n.s index 0bdc08b1fb..946906ecf6 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/and_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/and_n.s @@ -90,7 +90,6 @@ - .text @@ -103,50 +102,42 @@ __gmpn_and_n: mov (%rdx), %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), 
%rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: and (%rsi), %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi +.Lb11: and (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: and (%rsi), %r8 - mov %r8, (%rdi) - dec %rcx +.Lb01: and (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 -.Lb00: mov 8(%rdx), %r9 - and (%rsi), %r8 - and 8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 -.Le10: mov 24(%rdx), %r9 - lea 32(%rdx), %rdx - and 16(%rsi), %r8 - and 24(%rsi), %r9 - lea 32(%rsi), %rsi - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + +.Ltop: mov (%rdx,%rcx,8), %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + and (%rsi,%rcx,8), %r8 + and 8(%rsi,%rcx,8), %r9 + nop + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + and 16(%rsi,%rcx,8), %r8 + and 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/andn_n.s b/ext/gmp/gen/x86_64-linux/mpn/andn_n.s index 73fe85c5fd..aee1df4efc 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/andn_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/andn_n.s @@ -92,7 +92,6 @@ - .text @@ -106,54 +105,46 @@ __gmpn_andn_n: mov (%rdx), %r8 not %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: and (%rsi), %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi +.Lb11: and (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 
-.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: and (%rsi), %r8 - mov %r8, (%rdi) - dec %rcx + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: and (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 +.Ltop: mov (%rdx,%rcx,8), %r8 not %r8 -.Lb00: mov 8(%rdx), %r9 +.Lb00: mov 8(%rdx,%rcx,8), %r9 not %r9 - and (%rsi), %r8 - and 8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 + and (%rsi,%rcx,8), %r8 + and 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 not %r8 -.Le10: mov 24(%rdx), %r9 +.Le10: mov 24(%rdx,%rcx,8), %r9 not %r9 - lea 32(%rdx), %rdx - and 16(%rsi), %r8 - and 24(%rsi), %r9 - lea 32(%rsi), %rsi - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + and 16(%rsi,%rcx,8), %r8 + and 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/bdiv_q_1.s b/ext/gmp/gen/x86_64-linux/mpn/bdiv_q_1.s index 3c96e43ecb..4f58778551 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/bdiv_q_1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/bdiv_q_1.s @@ -145,63 +145,46 @@ __gmpn_pi1_bdiv_q_1: dec %r10 jz .Lone - lea 8(%rsi,%r10,8), %rsi + mov 8(%rsi), %rdx + lea (%rsi,%r10,8), %rsi lea (%rdi,%r10,8), %rdi neg %r10 - test %ecx, %ecx - jnz .Lunorm + shrd %cl, %rdx, %rax + xor %ebx, %ebx - jmp .Lnent + jmp .Lent .align 8, 0x90 -.Lntop:mul %r11 - mov -8(%rsi,%r10,8), %rax +.Ltop: + + + + + + + + mul %r11 + mov (%rsi,%r10,8), %rax + mov 8(%rsi,%r10,8), %r9 + shrd %cl, %r9, %rax + nop sub %rbx, %rax - setc %bl + setc %bl sub %rdx, %rax - adc $0, %ebx -.Lnent:imul %r8, %rax - mov %rax, (%rdi,%r10,8) - inc %r10 - jnz .Lntop - - mov -8(%rsi), %r9 - jmp .Lcom - -.Lunorm: 
- mov (%rsi,%r10,8), %r9 - shr %cl, %rax - neg %ecx - shl %cl, %r9 - neg %ecx - or %r9, %rax - xor %ebx, %ebx - jmp .Luent - - .align 8, 0x90 -.Lutop:mul %r11 - mov (%rsi,%r10,8), %rax - shl %cl, %rax - neg %ecx - or %r9, %rax + adc $0, %ebx +.Lent: imul %r8, %rax + mov %rax, (%rdi,%r10,8) + inc %r10 + jnz .Ltop + + mul %r11 + mov (%rsi), %rax + shr %cl, %rax sub %rbx, %rax - setc %bl sub %rdx, %rax - adc $0, %ebx -.Luent:imul %r8, %rax - mov (%rsi,%r10,8), %r9 - shr %cl, %r9 - neg %ecx - mov %rax, (%rdi,%r10,8) - inc %r10 - jnz .Lutop - -.Lcom: mul %r11 - sub %rbx, %r9 - sub %rdx, %r9 - imul %r8, %r9 - mov %r9, (%rdi) + imul %r8, %rax + mov %rax, (%rdi) pop %rbx ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/cnd_add_n.s b/ext/gmp/gen/x86_64-linux/mpn/cnd_add_n.s index 063d5dc7d7..b046e3642c 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/cnd_add_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/cnd_add_n.s @@ -60,11 +60,6 @@ - - - - - @@ -73,7 +68,9 @@ - + + + @@ -92,92 +89,101 @@ __gmpn_cnd_add_n: push %rbx + push %rbp + push %r12 + push %r13 + push %r14 neg %rdi - sbb %rbx, %rbx - - test $1, %r8b - jz .Lx0 -.Lx1: test $2, %r8b - jz .Lb1 - -.Lb3: mov (%rcx), %rdi - mov 8(%rcx), %r9 - mov 16(%rcx), %r10 - and %rbx, %rdi - and %rbx, %r9 - and %rbx, %r10 - add (%rdx), %rdi - mov %rdi, (%rsi) - adc 8(%rdx), %r9 - mov %r9, 8(%rsi) - adc 16(%rdx), %r10 - mov %r10, 16(%rsi) + sbb %rdi, %rdi + + lea (%rcx,%r8,8), %rcx + lea (%rdx,%r8,8), %rdx + lea (%rsi,%r8,8), %rsi + + mov %r8d, %eax + neg %r8 + and $3, %eax + jz .Ltop + cmp $2, %eax + jc .Lb1 + jz .Lb2 + +.Lb3: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov 16(%rcx,%r8,8), %r14 + and %rdi, %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r13 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r14 + mov 16(%rdx,%r8,8), %rbp + add %r12, %r10 + mov %r10, (%rsi,%r8,8) + adc %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) + adc %r14, %rbp + mov %rbp, 16(%rsi,%r8,8) sbb %eax, %eax - lea 24(%rdx), %rdx - lea 24(%rcx), %rcx - lea 24(%rsi), %rsi - sub $3, %r8 - jnz 
.Ltop + add $3, %r8 + js .Ltop jmp .Lend -.Lx0: xor %eax, %eax - test $2, %r8b - jz .Ltop - -.Lb2: mov (%rcx), %rdi - mov 8(%rcx), %r9 - and %rbx, %rdi - and %rbx, %r9 - add (%rdx), %rdi - mov %rdi, (%rsi) - adc 8(%rdx), %r9 - mov %r9, 8(%rsi) +.Lb2: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov (%rdx,%r8,8), %r10 + and %rdi, %r12 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r13 + add %r12, %r10 + mov %r10, (%rsi,%r8,8) + adc %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) sbb %eax, %eax - lea 16(%rdx), %rdx - lea 16(%rcx), %rcx - lea 16(%rsi), %rsi - sub $2, %r8 - jnz .Ltop + add $2, %r8 + js .Ltop jmp .Lend -.Lb1: mov (%rcx), %rdi - and %rbx, %rdi - add (%rdx), %rdi - mov %rdi, (%rsi) +.Lb1: mov (%rcx,%r8,8), %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r12 + add %r12, %r10 + mov %r10, (%rsi,%r8,8) sbb %eax, %eax - lea 8(%rdx), %rdx - lea 8(%rcx), %rcx - lea 8(%rsi), %rsi - dec %r8 - jz .Lend + add $1, %r8 + jns .Lend .align 16, 0x90 -.Ltop: mov (%rcx), %rdi - mov 8(%rcx), %r9 - mov 16(%rcx), %r10 - mov 24(%rcx), %r11 - lea 32(%rcx), %rcx - and %rbx, %rdi - and %rbx, %r9 - and %rbx, %r10 - and %rbx, %r11 +.Ltop: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov 16(%rcx,%r8,8), %r14 + mov 24(%rcx,%r8,8), %r11 + and %rdi, %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r13 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r14 + mov 16(%rdx,%r8,8), %rbp + and %rdi, %r11 + mov 24(%rdx,%r8,8), %r9 add %eax, %eax - adc (%rdx), %rdi - mov %rdi, (%rsi) - adc 8(%rdx), %r9 - mov %r9, 8(%rsi) - adc 16(%rdx), %r10 - mov %r10, 16(%rsi) - adc 24(%rdx), %r11 - lea 32(%rdx), %rdx - mov %r11, 24(%rsi) - lea 32(%rsi), %rsi + adc %r12, %r10 + mov %r10, (%rsi,%r8,8) + adc %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) + adc %r14, %rbp + mov %rbp, 16(%rsi,%r8,8) + adc %r11, %r9 + mov %r9, 24(%rsi,%r8,8) sbb %eax, %eax - sub $4, %r8 - jnz .Ltop + add $4, %r8 + js .Ltop .Lend: neg %eax + pop %r14 + pop %r13 + pop %r12 + pop %rbp pop %rbx ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/cnd_sub_n.s 
b/ext/gmp/gen/x86_64-linux/mpn/cnd_sub_n.s index 40b0e30be4..596dd8fd48 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/cnd_sub_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/cnd_sub_n.s @@ -60,14 +60,6 @@ - - - - - - - - @@ -75,8 +67,6 @@ - - @@ -102,102 +92,95 @@ __gmpn_cnd_sub_n: push %rbp push %r12 push %r13 + push %r14 neg %rdi - sbb %rbx, %rbx - - test $1, %r8b - jz .Lx0 -.Lx1: test $2, %r8b - jz .Lb1 - -.Lb3: mov (%rcx), %rdi - mov 8(%rcx), %r9 - mov 16(%rcx), %r10 - and %rbx, %rdi - mov (%rdx), %r12 - and %rbx, %r9 - mov 8(%rdx), %r13 - and %rbx, %r10 - mov 16(%rdx), %rbp - sub %rdi, %r12 - mov %r12, (%rsi) - sbb %r9, %r13 - mov %r13, 8(%rsi) - sbb %r10, %rbp - mov %rbp, 16(%rsi) + sbb %rdi, %rdi + + lea (%rcx,%r8,8), %rcx + lea (%rdx,%r8,8), %rdx + lea (%rsi,%r8,8), %rsi + + mov %r8d, %eax + neg %r8 + and $3, %eax + jz .Ltop + cmp $2, %eax + jc .Lb1 + jz .Lb2 + +.Lb3: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov 16(%rcx,%r8,8), %r14 + and %rdi, %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r13 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r14 + mov 16(%rdx,%r8,8), %rbp + sub %r12, %r10 + mov %r10, (%rsi,%r8,8) + sbb %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) + sbb %r14, %rbp + mov %rbp, 16(%rsi,%r8,8) sbb %eax, %eax - lea 24(%rdx), %rdx - lea 24(%rcx), %rcx - lea 24(%rsi), %rsi - sub $3, %r8 - jnz .Ltop + add $3, %r8 + js .Ltop jmp .Lend -.Lx0: xor %eax, %eax - test $2, %r8b - jz .Ltop - -.Lb2: mov (%rcx), %rdi - mov 8(%rcx), %r9 - mov (%rdx), %r12 - and %rbx, %rdi - mov 8(%rdx), %r13 - and %rbx, %r9 - sub %rdi, %r12 - mov %r12, (%rsi) - sbb %r9, %r13 - mov %r13, 8(%rsi) +.Lb2: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov (%rdx,%r8,8), %r10 + and %rdi, %r12 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r13 + sub %r12, %r10 + mov %r10, (%rsi,%r8,8) + sbb %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) sbb %eax, %eax - lea 16(%rdx), %rdx - lea 16(%rcx), %rcx - lea 16(%rsi), %rsi - sub $2, %r8 - jnz .Ltop + add $2, %r8 + js .Ltop jmp .Lend -.Lb1: mov (%rcx), %rdi - mov (%rdx), %r12 - 
and %rbx, %rdi - sub %rdi, %r12 - mov %r12, (%rsi) +.Lb1: mov (%rcx,%r8,8), %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r12 + sub %r12, %r10 + mov %r10, (%rsi,%r8,8) sbb %eax, %eax - lea 8(%rdx), %rdx - lea 8(%rcx), %rcx - lea 8(%rsi), %rsi - dec %r8 - jz .Lend + add $1, %r8 + jns .Lend .align 16, 0x90 -.Ltop: mov (%rcx), %rdi - mov 8(%rcx), %r9 - mov 16(%rcx), %r10 - mov 24(%rcx), %r11 - lea 32(%rcx), %rcx - and %rbx, %rdi - mov (%rdx), %r12 - and %rbx, %r9 - mov 8(%rdx), %r13 - and %rbx, %r10 - mov 16(%rdx), %rbp - and %rbx, %r11 +.Ltop: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov 16(%rcx,%r8,8), %r14 + mov 24(%rcx,%r8,8), %r11 + and %rdi, %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r13 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r14 + mov 16(%rdx,%r8,8), %rbp + and %rdi, %r11 + mov 24(%rdx,%r8,8), %r9 add %eax, %eax - mov 24(%rdx), %rax - lea 32(%rdx), %rdx - sbb %rdi, %r12 - mov %r12, (%rsi) - sbb %r9, %r13 - mov %r13, 8(%rsi) - sbb %r10, %rbp - mov %rbp, 16(%rsi) - sbb %r11, %rax - mov %rax, 24(%rsi) - lea 32(%rsi), %rsi + sbb %r12, %r10 + mov %r10, (%rsi,%r8,8) + sbb %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) + sbb %r14, %rbp + mov %rbp, 16(%rsi,%r8,8) + sbb %r11, %r9 + mov %r9, 24(%rsi,%r8,8) sbb %eax, %eax - sub $4, %r8 - jnz .Ltop + add $4, %r8 + js .Ltop .Lend: neg %eax + pop %r14 pop %r13 pop %r12 pop %rbp diff --git a/ext/gmp/gen/x86_64-linux/mpn/com.s b/ext/gmp/gen/x86_64-linux/mpn/com.s index 9d4f49cfc0..ff14001990 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/com.s +++ b/ext/gmp/gen/x86_64-linux/mpn/com.s @@ -39,44 +39,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -99,237 +61,50 @@ .text - .align 64, 0x90 + .align 32, 0x90 .globl __gmpn_com .type __gmpn_com,@function __gmpn_com: - - cmp $7, %rdx - jbe .Lbc - - pcmpeqb %xmm5, %xmm5 - - test $8, %dil - jz .Lrp_aligned - - mov (%rsi), %r8 - lea 8(%rsi), %rsi - not %r8 - mov %r8, (%rdi) - lea 8(%rdi), %rdi - dec %rdx - -.Lrp_aligned: - test $8, %sil - jnz .Luent - - jmp 
.Lam - - .align 16, 0x90 -.Latop:movaps 0(%rsi), %xmm0 - movaps 16(%rsi), %xmm1 - movaps 32(%rsi), %xmm2 - movaps 48(%rsi), %xmm3 - lea 64(%rsi), %rsi - pxor %xmm5, %xmm0 - pxor %xmm5, %xmm1 - pxor %xmm5, %xmm2 - pxor %xmm5, %xmm3 - movaps %xmm0, (%rdi) - movaps %xmm1, 16(%rdi) - movaps %xmm2, 32(%rdi) - movaps %xmm3, 48(%rdi) - lea 64(%rdi), %rdi -.Lam: sub $8, %rdx - jnc .Latop - - test $4, %dl - jz 1f - movaps (%rsi), %xmm0 - movaps 16(%rsi), %xmm1 - lea 32(%rsi), %rsi - pxor %xmm5, %xmm0 - pxor %xmm5, %xmm1 - movaps %xmm0, (%rdi) - movaps %xmm1, 16(%rdi) - lea 32(%rdi), %rdi - -1: test $2, %dl - jz 1f - movaps (%rsi), %xmm0 - lea 16(%rsi), %rsi - pxor %xmm5, %xmm0 - movaps %xmm0, (%rdi) - lea 16(%rdi), %rdi - -1: test $1, %dl - jz 1f - mov (%rsi), %r8 - not %r8 - mov %r8, (%rdi) - -1: - ret - -.Luent: - - - - - lea -40(%rsi), %rax - sub %rdi, %rax - cmp $80, %rax - jbe .Lbc - - sub $16, %rdx - jc .Luend - - movaps 120(%rsi), %xmm3 - - sub $16, %rdx - jmp .Lum - - .align 16, 0x90 -.Lutop:movaps 120(%rsi), %xmm3 - pxor %xmm5, %xmm0 - movaps %xmm0, -128(%rdi) - sub $16, %rdx -.Lum: movaps 104(%rsi), %xmm2 - .byte 0x66,0x0f,0x3a,0x0f,218,8 - movaps 88(%rsi), %xmm1 - pxor %xmm5, %xmm3 - movaps %xmm3, 112(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,209,8 - movaps 72(%rsi), %xmm0 - pxor %xmm5, %xmm2 - movaps %xmm2, 96(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps 56(%rsi), %xmm3 - pxor %xmm5, %xmm1 - movaps %xmm1, 80(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,195,8 - movaps 40(%rsi), %xmm2 - pxor %xmm5, %xmm0 - movaps %xmm0, 64(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,218,8 - movaps 24(%rsi), %xmm1 - pxor %xmm5, %xmm3 - movaps %xmm3, 48(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,209,8 - movaps 8(%rsi), %xmm0 - pxor %xmm5, %xmm2 - movaps %xmm2, 32(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps -8(%rsi), %xmm3 - pxor %xmm5, %xmm1 - movaps %xmm1, 16(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,195,8 - lea 128(%rsi), %rsi - lea 128(%rdi), %rdi - jnc .Lutop - - pxor %xmm5, %xmm0 - movaps %xmm0, -128(%rdi) - 
-.Luend:test $8, %dl - jz 1f - movaps 56(%rsi), %xmm3 - movaps 40(%rsi), %xmm2 - .byte 0x66,0x0f,0x3a,0x0f,218,8 - movaps 24(%rsi), %xmm1 - pxor %xmm5, %xmm3 - movaps %xmm3, 48(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,209,8 - movaps 8(%rsi), %xmm0 - pxor %xmm5, %xmm2 - movaps %xmm2, 32(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps -8(%rsi), %xmm3 - pxor %xmm5, %xmm1 - movaps %xmm1, 16(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,195,8 - lea 64(%rsi), %rsi - pxor %xmm5, %xmm0 - movaps %xmm0, (%rdi) - lea 64(%rdi), %rdi - -1: test $4, %dl - jz 1f - movaps 24(%rsi), %xmm1 - movaps 8(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps -8(%rsi), %xmm3 - pxor %xmm5, %xmm1 - movaps %xmm1, 16(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,195,8 - lea 32(%rsi), %rsi - pxor %xmm5, %xmm0 - movaps %xmm0, (%rdi) - lea 32(%rdi), %rdi - -1: test $2, %dl - jz 1f - movaps 8(%rsi), %xmm0 - movaps -8(%rsi), %xmm3 - .byte 0x66,0x0f,0x3a,0x0f,195,8 - lea 16(%rsi), %rsi - pxor %xmm5, %xmm0 - movaps %xmm0, (%rdi) - lea 16(%rdi), %rdi - -1: test $1, %dl - jz 1f - mov (%rsi), %r8 - not %r8 - mov %r8, (%rdi) - -1: - ret - - - - -.Lbc: lea -8(%rdi), %rdi - sub $4, %edx - jc .Lend - - .align 16, 0x90 -.Ltop: mov (%rsi), %r8 - mov 8(%rsi), %r9 - lea 32(%rdi), %rdi - mov 16(%rsi), %r10 - mov 24(%rsi), %r11 - lea 32(%rsi), %rsi - not %r8 - not %r9 - not %r10 - not %r11 - mov %r8, -24(%rdi) - mov %r9, -16(%rdi) - sub $4, %edx - mov %r10, -8(%rdi) - mov %r11, (%rdi) - jnc .Ltop - -.Lend: test $1, %dl - jz 1f - mov (%rsi), %r8 - not %r8 - mov %r8, 8(%rdi) - lea 8(%rdi), %rdi - lea 8(%rsi), %rsi -1: test $2, %dl - jz 1f - mov (%rsi), %r8 - mov 8(%rsi), %r9 - not %r8 - not %r9 - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) -1: + movq (%rsi), %r8 + movl %edx, %eax + leaq (%rsi,%rdx,8), %rsi + leaq (%rdi,%rdx,8), %rdi + negq %rdx + andl $3, %eax + je .Lb00 + cmpl $2, %eax + jc .Lb01 + je .Lb10 + +.Lb11: notq %r8 + movq %r8, (%rdi,%rdx,8) + decq %rdx + jmp .Le11 +.Lb10: addq $-2, %rdx + jmp .Le10 + .byte 0x90,0x90,0x90,0x90,0x90,0x90 
+.Lb01: notq %r8 + movq %r8, (%rdi,%rdx,8) + incq %rdx + jz .Lret + +.Loop: movq (%rsi,%rdx,8), %r8 +.Lb00: movq 8(%rsi,%rdx,8), %r9 + notq %r8 + notq %r9 + movq %r8, (%rdi,%rdx,8) + movq %r9, 8(%rdi,%rdx,8) +.Le11: movq 16(%rsi,%rdx,8), %r8 +.Le10: movq 24(%rsi,%rdx,8), %r9 + notq %r8 + notq %r9 + movq %r8, 16(%rdi,%rdx,8) + movq %r9, 24(%rdi,%rdx,8) + addq $4, %rdx + jnc .Loop +.Lret: ret .size __gmpn_com,.-__gmpn_com - diff --git a/ext/gmp/gen/x86_64-linux/mpn/copyd.s b/ext/gmp/gen/x86_64-linux/mpn/copyd.s index 583e8c9ec5..f375481084 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/copyd.s +++ b/ext/gmp/gen/x86_64-linux/mpn/copyd.s @@ -45,35 +45,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -102,178 +73,36 @@ __gmpn_copyd: - - lea -8(%rsi,%rdx,8), %rsi - lea -8(%rdi,%rdx,8), %rdi - - cmp $7, %rdx - jbe .Lbc - - test $8, %dil - jnz .Lrp_aligned - - mov (%rsi), %rax - mov %rax, (%rdi) - lea -8(%rsi), %rsi - lea -8(%rdi), %rdi - dec %rdx - -.Lrp_aligned: - test $8, %sil - jz .Luent - - jmp .Lam - - .align 16, 0x90 -.Latop:movaps -8(%rsi), %xmm0 - movaps -24(%rsi), %xmm1 - movaps -40(%rsi), %xmm2 - movaps -56(%rsi), %xmm3 - lea -64(%rsi), %rsi - movaps %xmm0, -8(%rdi) - movaps %xmm1, -24(%rdi) - movaps %xmm2, -40(%rdi) - movaps %xmm3, -56(%rdi) - lea -64(%rdi), %rdi -.Lam: sub $8, %rdx - jnc .Latop - - test $4, %dl - jz 1f - movaps -8(%rsi), %xmm0 - movaps -24(%rsi), %xmm1 - lea -32(%rsi), %rsi - movaps %xmm0, -8(%rdi) - movaps %xmm1, -24(%rdi) - lea -32(%rdi), %rdi - -1: test $2, %dl - jz 1f - movaps -8(%rsi), %xmm0 - lea -16(%rsi), %rsi - movaps %xmm0, -8(%rdi) - lea -16(%rdi), %rdi - -1: test $1, %dl - jz 1f - mov (%rsi), %r8 - mov %r8, (%rdi) - -1: - ret - -.Luent:sub $16, %rdx - movaps (%rsi), %xmm0 - jc .Luend - - .align 16, 0x90 -.Lutop:sub $16, %rdx - movaps -16(%rsi), %xmm1 - .byte 0x66,0x0f,0x3a,0x0f,193,8 - movaps %xmm0, -8(%rdi) - movaps -32(%rsi), %xmm2 - .byte 0x66,0x0f,0x3a,0x0f,202,8 - movaps %xmm1, -24(%rdi) - movaps -48(%rsi), %xmm3 - 
.byte 0x66,0x0f,0x3a,0x0f,211,8 - movaps %xmm2, -40(%rdi) - movaps -64(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,216,8 - movaps %xmm3, -56(%rdi) - movaps -80(%rsi), %xmm1 - .byte 0x66,0x0f,0x3a,0x0f,193,8 - movaps %xmm0, -72(%rdi) - movaps -96(%rsi), %xmm2 - .byte 0x66,0x0f,0x3a,0x0f,202,8 - movaps %xmm1, -88(%rdi) - movaps -112(%rsi), %xmm3 - .byte 0x66,0x0f,0x3a,0x0f,211,8 - movaps %xmm2, -104(%rdi) - movaps -128(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,216,8 - movaps %xmm3, -120(%rdi) - lea -128(%rsi), %rsi - lea -128(%rdi), %rdi - jnc .Lutop - -.Luend:test $8, %dl - jz 1f - movaps -16(%rsi), %xmm1 - .byte 0x66,0x0f,0x3a,0x0f,193,8 - movaps %xmm0, -8(%rdi) - movaps -32(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps %xmm1, -24(%rdi) - movaps -48(%rsi), %xmm1 - .byte 0x66,0x0f,0x3a,0x0f,193,8 - movaps %xmm0, -40(%rdi) - movaps -64(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps %xmm1, -56(%rdi) - lea -64(%rsi), %rsi - lea -64(%rdi), %rdi - -1: test $4, %dl - jz 1f - movaps -16(%rsi), %xmm1 - .byte 0x66,0x0f,0x3a,0x0f,193,8 - movaps %xmm0, -8(%rdi) - movaps -32(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps %xmm1, -24(%rdi) - lea -32(%rsi), %rsi - lea -32(%rdi), %rdi - -1: test $2, %dl - jz 1f - movaps -16(%rsi), %xmm1 - .byte 0x66,0x0f,0x3a,0x0f,193,8 - movaps %xmm0, -8(%rdi) - lea -16(%rsi), %rsi - lea -16(%rdi), %rdi - -1: test $1, %dl - jz 1f - mov (%rsi), %r8 - mov %r8, (%rdi) - -1: - ret - - - - -.Lbc: sub $4, %edx + lea (%rdi,%rdx,8), %rdi + sub $4, %rdx jc .Lend + nop - .align 16, 0x90 -.Ltop: mov (%rsi), %r8 +.Ltop: mov (%rsi), %rax mov -8(%rsi), %r9 lea -32(%rdi), %rdi mov -16(%rsi), %r10 mov -24(%rsi), %r11 lea -32(%rsi), %rsi - mov %r8, 32(%rdi) - mov %r9, 24(%rdi) - - mov %r10, 16(%rdi) - mov %r11, 8(%rdi) - - -.Lend: test $1, %dl - jz 1f - mov (%rsi), %r8 - mov %r8, (%rdi) + mov %rax, 24(%rdi) + mov %r9, 16(%rdi) + sub $4, %rdx + mov %r10, 8(%rdi) + mov %r11, (%rdi) + jnc .Ltop + +.Lend: shr %edx + jnc 1f + mov (%rsi), %rax + 
mov %rax, -8(%rdi) lea -8(%rdi), %rdi lea -8(%rsi), %rsi -1: test $2, %dl - jz 1f - mov (%rsi), %r8 +1: shr %edx + jnc 1f + mov (%rsi), %rax mov -8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, -8(%rdi) -1: - ret + mov %rax, -8(%rdi) + mov %r9, -16(%rdi) +1: ret .size __gmpn_copyd,.-__gmpn_copyd - diff --git a/ext/gmp/gen/x86_64-linux/mpn/copyi.s b/ext/gmp/gen/x86_64-linux/mpn/copyi.s index a5c971baa6..dc746b2270 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/copyi.s +++ b/ext/gmp/gen/x86_64-linux/mpn/copyi.s @@ -45,38 +45,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -100,225 +68,40 @@ .text .align 64, 0x90 + .byte 0,0,0,0,0,0 .globl __gmpn_copyi .type __gmpn_copyi,@function __gmpn_copyi: - - - cmp $7, %rdx - jbe .Lbc - - test $8, %dil - jz .Lrp_aligned - - movsq - dec %rdx - -.Lrp_aligned: - test $8, %sil - jnz .Luent - - jmp .Lam - - .align 16, 0x90 -.Latop:movdqa 0(%rsi), %xmm0 - movdqa 16(%rsi), %xmm1 - movdqa 32(%rsi), %xmm2 - movdqa 48(%rsi), %xmm3 - lea 64(%rsi), %rsi - movdqa %xmm0, (%rdi) - movdqa %xmm1, 16(%rdi) - movdqa %xmm2, 32(%rdi) - movdqa %xmm3, 48(%rdi) - lea 64(%rdi), %rdi -.Lam: sub $8, %rdx - jnc .Latop - - test $4, %dl - jz 1f - movdqa (%rsi), %xmm0 - movdqa 16(%rsi), %xmm1 - lea 32(%rsi), %rsi - movdqa %xmm0, (%rdi) - movdqa %xmm1, 16(%rdi) - lea 32(%rdi), %rdi - -1: test $2, %dl - jz 1f - movdqa (%rsi), %xmm0 - lea 16(%rsi), %rsi - movdqa %xmm0, (%rdi) - lea 16(%rdi), %rdi - -1: test $1, %dl - jz 1f - mov (%rsi), %r8 - mov %r8, (%rdi) - -1: - ret - -.Luent: - - - cmp $16, %rdx - jc .Lued0 - - - - - - - movaps 120(%rsi), %xmm7 - movaps 104(%rsi), %xmm6 - movaps 88(%rsi), %xmm5 - movaps 72(%rsi), %xmm4 - movaps 56(%rsi), %xmm3 - movaps 40(%rsi), %xmm2 - lea 128(%rsi), %rsi - sub $32, %rdx - jc .Lued1 - - .align 16, 0x90 -.Lutop:movaps -104(%rsi), %xmm1 - sub $16, %rdx - movaps -120(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,254,8 - movaps -136(%rsi), %xmm8 - movdqa %xmm7, 112(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,245,8 - movaps 
120(%rsi), %xmm7 - movdqa %xmm6, 96(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,236,8 - movaps 104(%rsi), %xmm6 - movdqa %xmm5, 80(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,227,8 - movaps 88(%rsi), %xmm5 - movdqa %xmm4, 64(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,218,8 - movaps 72(%rsi), %xmm4 - movdqa %xmm3, 48(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,209,8 - movaps 56(%rsi), %xmm3 - movdqa %xmm2, 32(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps 40(%rsi), %xmm2 - movdqa %xmm1, 16(%rdi) - .byte 0x66,65,0x0f,0x3a,0x0f,192,8 - lea 128(%rsi), %rsi - movdqa %xmm0, (%rdi) - lea 128(%rdi), %rdi - jnc .Lutop - -.Lued1:movaps -104(%rsi), %xmm1 - movaps -120(%rsi), %xmm0 - movaps -136(%rsi), %xmm8 - .byte 0x66,0x0f,0x3a,0x0f,254,8 - movdqa %xmm7, 112(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,245,8 - movdqa %xmm6, 96(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,236,8 - movdqa %xmm5, 80(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,227,8 - movdqa %xmm4, 64(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,218,8 - movdqa %xmm3, 48(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,209,8 - movdqa %xmm2, 32(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movdqa %xmm1, 16(%rdi) - .byte 0x66,65,0x0f,0x3a,0x0f,192,8 - movdqa %xmm0, (%rdi) - lea 128(%rdi), %rdi - - - - - - -.Lued0:test $8, %dl - jz 1f - movaps 56(%rsi), %xmm3 - movaps 40(%rsi), %xmm2 - movaps 24(%rsi), %xmm1 - movaps 8(%rsi), %xmm0 - movaps -8(%rsi), %xmm4 - .byte 0x66,0x0f,0x3a,0x0f,218,8 - movdqa %xmm3, 48(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,209,8 - movdqa %xmm2, 32(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movdqa %xmm1, 16(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,196,8 - lea 64(%rsi), %rsi - movdqa %xmm0, (%rdi) - lea 64(%rdi), %rdi - -1: test $4, %dl - jz 1f - movaps 24(%rsi), %xmm1 - movaps 8(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps -8(%rsi), %xmm3 - movdqa %xmm1, 16(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,195,8 - lea 32(%rsi), %rsi - movdqa %xmm0, (%rdi) - lea 32(%rdi), %rdi - -1: test $2, %dl - jz 1f - movdqa 8(%rsi), %xmm0 - movdqa -8(%rsi), %xmm3 - .byte 0x66,0x0f,0x3a,0x0f,195,8 - lea 16(%rsi), %rsi - 
movdqa %xmm0, (%rdi) - lea 16(%rdi), %rdi - -1: test $1, %dl - jz 1f - mov (%rsi), %r8 - mov %r8, (%rdi) - -1: - ret - - - - -.Lbc: lea -8(%rdi), %rdi - sub $4, %edx + lea -8(%rdi), %rdi + sub $4, %rdx jc .Lend - .align 16, 0x90 -.Ltop: mov (%rsi), %r8 +.Ltop: mov (%rsi), %rax mov 8(%rsi), %r9 lea 32(%rdi), %rdi mov 16(%rsi), %r10 mov 24(%rsi), %r11 lea 32(%rsi), %rsi - mov %r8, -24(%rdi) + mov %rax, -24(%rdi) mov %r9, -16(%rdi) - + sub $4, %rdx mov %r10, -8(%rdi) mov %r11, (%rdi) + jnc .Ltop - -.Lend: test $1, %dl - jz 1f - mov (%rsi), %r8 - mov %r8, 8(%rdi) +.Lend: shr %edx + jnc 1f + mov (%rsi), %rax + mov %rax, 8(%rdi) lea 8(%rdi), %rdi lea 8(%rsi), %rsi -1: test $2, %dl - jz 1f - mov (%rsi), %r8 +1: shr %edx + jnc 1f + mov (%rsi), %rax mov 8(%rsi), %r9 - mov %r8, 8(%rdi) + mov %rax, 8(%rdi) mov %r9, 16(%rdi) -1: - ret +1: ret .size __gmpn_copyi,.-__gmpn_copyi - diff --git a/ext/gmp/gen/x86_64-linux/mpn/div_qr_1n_pi1.s b/ext/gmp/gen/x86_64-linux/mpn/div_qr_1n_pi1.s index 652beccbf2..fd8ce8e9e6 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/div_qr_1n_pi1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/div_qr_1n_pi1.s @@ -158,17 +158,18 @@ __gmpn_div_qr_1n_pi1: dec %r8 mov %rcx, %rax jz .Lfinal + mov $0, %r14d .align 16, 0x90 + .Lloop: - mov %r9, %r14 + cmovc %r9, %r14 mov %r12, %r15 - and %r12, %r14 neg %r15 mul %r9 add %rdx, %r14 @@ -195,6 +196,7 @@ __gmpn_div_qr_1n_pi1: mov %r10, %rax adc %rdx, %rax mov %r14, (%rdi, %r8, 8) + mov $0, %r14d sbb %r12, %r12 dec %r8 mov %rax, %rcx diff --git a/ext/gmp/gen/x86_64-linux/mpn/divrem_1.s b/ext/gmp/gen/x86_64-linux/mpn/divrem_1.s index 5363432e8d..e689bd27f4 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/divrem_1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/divrem_1.s @@ -64,20 +64,6 @@ - - - - - - - - - - - - - - @@ -347,4 +333,3 @@ __gmpn_divrem_1: ret .size __gmpn_divrem_1,.-__gmpn_divrem_1 - diff --git a/ext/gmp/gen/x86_64-linux/mpn/gcd_11.s b/ext/gmp/gen/x86_64-linux/mpn/gcd_11.s index 4647639cd9..cf35d253b9 100644 --- 
a/ext/gmp/gen/x86_64-linux/mpn/gcd_11.s +++ b/ext/gmp/gen/x86_64-linux/mpn/gcd_11.s @@ -76,13 +76,140 @@ - - - - - - - + .section .rodata + .align 64, 0x90 +ctz_table: + + .byte 7 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 6 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + + .size ctz_table,.-ctz_table @@ -99,22 +226,31 @@ __gmpn_gcd_11: - jmp .Lodd + mov ctz_table@GOTPCREL(%rip), %r8 + + + jmp .Lent .align 16, 0x90 .Ltop: cmovc %rdx, %rdi cmovc %rax, %rsi +.Lmid: and $127, %edx + movzbl (%r8,%rdx), %ecx + jz .Lshift_alot shr %cl, %rdi -.Lodd: mov %rsi, %rdx - sub %rdi, %rdx - bsf %rdx, %rcx - mov %rdi, %rax - sub %rsi, %rdi +.Lent: mov %rdi, %rax + mov %rsi, %rdx + sub %rdi, %rdx + sub %rsi, %rdi jnz .Ltop .Lend: ret - .size __gmpn_gcd_11,.-__gmpn_gcd_11 +.Lshift_alot: + shr $7, %rdi + mov %rdi, %rdx + jmp 
.Lmid + .size __gmpn_gcd_11,.-__gmpn_gcd_11 diff --git a/ext/gmp/gen/x86_64-linux/mpn/gcd_22.s b/ext/gmp/gen/x86_64-linux/mpn/gcd_22.s index e3d86b92e4..60f4c714c9 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/gcd_22.s +++ b/ext/gmp/gen/x86_64-linux/mpn/gcd_22.s @@ -67,6 +67,276 @@ + + + + + + + + + .section .rodata + .align 64, 0x90 +ctz_table: + + .byte 8 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 6 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 7 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + 
.byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 6 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + + .size ctz_table,.-ctz_table @@ -92,32 +362,40 @@ __gmpn_gcd_22: + mov %rcx, %rax + + mov ctz_table@GOTPCREL(%rip), %r10 + + .align 16, 0x90 -.Ltop: mov %rcx, %r10 - sub %rsi, %r10 +.Ltop: mov %rax, %rcx + sub %rsi, %rcx jz .Llowz mov %rdx, %r11 sbb %rdi, %r11 - rep;bsf %r10, %rax - mov %rsi, %r8 - sub %rcx, %rsi mov %rdi, %r9 + + sub %rax, %rsi sbb %rdx, %rdi -.Lbck: cmovc %r10, %rsi +.Lbck: cmovc %rcx, %rsi cmovc %r11, %rdi - cmovc %r8, %rcx + cmovc %r8, %rax cmovc %r9, %rdx - xor %r10d, %r10d - sub %rax, %r10 - .byte 0xc4,98,169,0xf7,207 - .byte 0xc4,226,251,0xf7,246 - .byte 0xc4,226,251,0xf7,255 - or %r9, %rsi + and $255, %ecx + movzbl (%r10,%rcx), %ecx + jz .Lcount_better + +.Lshr: shr %cl, %rsi + mov %rdi, %r11 + shr %cl, %rdi + neg %rcx + shl %cl, %r11 + or %r11, %rsi test %rdx, %rdx jnz .Ltop @@ -125,29 +403,32 @@ __gmpn_gcd_22: jnz .Ltop .Lgcd_11: - mov %rcx, %rdi + mov %rax, %rdi jmp __gmpn_gcd_11@PLT +.Lcount_better: + rep;bsf %rsi, %rcx + jmp .Lshr + .Llowz: - mov %rdx, %r10 - sub %rdi, %r10 + mov %rdx, %rcx + sub %rdi, %rcx je .Lend xor %r11, %r11 mov %rsi, 
%r8 mov %rdi, %r9 - rep;bsf %r10, %rax mov %rdi, %rsi xor %rdi, %rdi sub %rdx, %rsi jmp .Lbck -.Lend: mov %rcx, %rax +.Lend: + -.Lret: ret .size __gmpn_gcd_22,.-__gmpn_gcd_22 diff --git a/ext/gmp/gen/x86_64-linux/mpn/hamdist.s b/ext/gmp/gen/x86_64-linux/mpn/hamdist.s index 1c5d6e4192..1ab3a8cca6 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/hamdist.s +++ b/ext/gmp/gen/x86_64-linux/mpn/hamdist.s @@ -60,21 +60,16 @@ - - - - - - - - - - - - - - - + + + + + + + + + + @@ -91,119 +86,82 @@ __gmpn_hamdist: - + push %rbx + mov $0x5555555555555555, %r10 push %rbp - - mov (%rdi), %r10 - xor (%rsi), %r10 - - mov %edx, %r8d - and $3, %r8d - - xor %ecx, %ecx - .byte 0xf3,0x49,0x0f,0xb8,0xc2 - - lea .Ltab(%rip), %r9 - - movslq (%r9,%r8,4), %r8 - add %r9, %r8 - jmp *%r8 - - -.L3: mov 8(%rdi), %r10 - mov 16(%rdi), %r11 - xor 8(%rsi), %r10 - xor 16(%rsi), %r11 - xor %ebp, %ebp - sub $4, %rdx - jle .Lx3 - mov 24(%rdi), %r8 - mov 32(%rdi), %r9 - add $24, %rdi - add $24, %rsi - jmp .Le3 - -.L0: mov 8(%rdi), %r9 - xor 8(%rsi), %r9 - mov 16(%rdi), %r10 - mov 24(%rdi), %r11 - xor %ebx, %ebx - xor 16(%rsi), %r10 - xor 24(%rsi), %r11 - add $32, %rdi - add $32, %rsi - sub $4, %rdx - jle .Lx4 + mov $0x3333333333333333, %r11 + push %r12 + lea (%rdi,%rdx,8), %rdi + mov $0x0f0f0f0f0f0f0f0f, %rcx + lea (%rsi,%rdx,8), %rsi + neg %rdx + mov $0x0101010101010101, %r12 + xor %eax, %eax + test $1, %dl + jz .Ltop + + mov (%rdi,%rdx,8), %r8 + xor (%rsi,%rdx,8), %r8 + + mov %r8, %r9 + shr %r8 + and %r10, %r8 + sub %r8, %r9 + + mov %r9, %r8 + shr $2, %r9 + and %r11, %r8 + and %r11, %r9 + add %r8, %r9 + + dec %rdx + jmp .Lmid .align 16, 0x90 -.Ltop: -.Le0: .byte 0xf3,0x49,0x0f,0xb8,0xe9 - mov (%rdi), %r8 - mov 8(%rdi), %r9 - add %rbx, %rax -.Le3: .byte 0xf3,0x49,0x0f,0xb8,0xda - xor (%rsi), %r8 - xor 8(%rsi), %r9 - add %rbp, %rcx -.Le2: .byte 0xf3,0x49,0x0f,0xb8,0xeb - mov 16(%rdi), %r10 - mov 24(%rdi), %r11 - add $32, %rdi - add %rbx, %rax -.Le1: .byte 0xf3,0x49,0x0f,0xb8,0xd8 - xor 16(%rsi), %r10 - xor 
24(%rsi), %r11 - add $32, %rsi - add %rbp, %rcx - sub $4, %rdx - jg .Ltop - -.Lx4: .byte 0xf3,0x49,0x0f,0xb8,0xe9 - add %rbx, %rax -.Lx3: .byte 0xf3,0x49,0x0f,0xb8,0xda - add %rbp, %rcx - .byte 0xf3,0x49,0x0f,0xb8,0xeb - add %rbx, %rax - add %rbp, %rcx -.Lx2: add %rcx, %rax -.Lx1: pop %rbp +.Ltop: mov (%rdi,%rdx,8), %r8 + mov 8(%rdi,%rdx,8), %rbx + xor (%rsi,%rdx,8), %r8 + xor 8(%rsi,%rdx,8), %rbx + + mov %r8, %r9 + mov %rbx, %rbp + shr %r8 + shr %rbx + and %r10, %r8 + and %r10, %rbx + sub %r8, %r9 + sub %rbx, %rbp + + mov %r9, %r8 + mov %rbp, %rbx + shr $2, %r9 + shr $2, %rbp + and %r11, %r8 + and %r11, %r9 + and %r11, %rbx + and %r11, %rbp + add %r8, %r9 + add %rbx, %rbp + + add %rbp, %r9 +.Lmid: mov %r9, %r8 + shr $4, %r9 + and %rcx, %r8 + and %rcx, %r9 + add %r8, %r9 + + imul %r12, %r9 + shr $56, %r9 + + add %r9, %rax + add $2, %rdx + jnc .Ltop + +.Lend: + pop %r12 + pop %rbp pop %rbx ret - -.L2: mov 8(%rdi), %r11 - xor 8(%rsi), %r11 - sub $2, %rdx - jle .Ln2 - mov 16(%rdi), %r8 - mov 24(%rdi), %r9 - xor %ebx, %ebx - xor 16(%rsi), %r8 - xor 24(%rsi), %r9 - add $16, %rdi - add $16, %rsi - jmp .Le2 -.Ln2: .byte 0xf3,0x49,0x0f,0xb8,0xcb - jmp .Lx2 - -.L1: dec %rdx - jle .Lx1 - mov 8(%rdi), %r8 - mov 16(%rdi), %r9 - xor 8(%rsi), %r8 - xor 16(%rsi), %r9 - xor %ebp, %ebp - mov 24(%rdi), %r10 - mov 32(%rdi), %r11 - add $40, %rdi - add $8, %rsi - jmp .Le1 - .size __gmpn_hamdist,.-__gmpn_hamdist - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .L0-.Ltab - .long .L1-.Ltab - .long .L2-.Ltab - .long .L3-.Ltab diff --git a/ext/gmp/gen/x86_64-linux/mpn/ior_n.s b/ext/gmp/gen/x86_64-linux/mpn/ior_n.s index fc23fd7190..6509f28b3b 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/ior_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/ior_n.s @@ -90,7 +90,6 @@ - .text @@ -103,50 +102,42 @@ __gmpn_ior_n: mov (%rdx), %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 
-.Lb11: or (%rsi), %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi +.Lb11: or (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: or (%rsi), %r8 - mov %r8, (%rdi) - dec %rcx +.Lb01: or (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 -.Lb00: mov 8(%rdx), %r9 - or (%rsi), %r8 - or 8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 -.Le10: mov 24(%rdx), %r9 - lea 32(%rdx), %rdx - or 16(%rsi), %r8 - or 24(%rsi), %r9 - lea 32(%rsi), %rsi - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + +.Ltop: mov (%rdx,%rcx,8), %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + or (%rsi,%rcx,8), %r8 + or 8(%rsi,%rcx,8), %r9 + nop + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + or 16(%rsi,%rcx,8), %r8 + or 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/iorn_n.s b/ext/gmp/gen/x86_64-linux/mpn/iorn_n.s index e13105d814..b199ca33ff 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/iorn_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/iorn_n.s @@ -92,7 +92,6 @@ - .text @@ -106,54 +105,46 @@ __gmpn_iorn_n: mov (%rdx), %r8 not %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: or (%rsi), %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi +.Lb11: or (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx 
jmp .Le10 -.Lb01: or (%rsi), %r8 - mov %r8, (%rdi) - dec %rcx + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: or (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 +.Ltop: mov (%rdx,%rcx,8), %r8 not %r8 -.Lb00: mov 8(%rdx), %r9 +.Lb00: mov 8(%rdx,%rcx,8), %r9 not %r9 - or (%rsi), %r8 - or 8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 + or (%rsi,%rcx,8), %r8 + or 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 not %r8 -.Le10: mov 24(%rdx), %r9 +.Le10: mov 24(%rdx,%rcx,8), %r9 not %r9 - lea 32(%rdx), %rdx - or 16(%rsi), %r8 - or 24(%rsi), %r9 - lea 32(%rsi), %rsi - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + or 16(%rsi,%rcx,8), %r8 + or 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/lshift.s b/ext/gmp/gen/x86_64-linux/mpn/lshift.s index ebd4035c21..89e9566e3c 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/lshift.s +++ b/ext/gmp/gen/x86_64-linux/mpn/lshift.s @@ -41,32 +41,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -89,123 +63,124 @@ .text - .align 64, 0x90 + .align 32, 0x90 .globl __gmpn_lshift .type __gmpn_lshift,@function __gmpn_lshift: - movd %ecx, %xmm4 - mov $64, %eax - sub %ecx, %eax - movd %eax, %xmm5 - - neg %ecx + neg %ecx mov -8(%rsi,%rdx,8), %rax - shr %cl, %rax - - cmp $3, %rdx - jle .Lbc + shr %cl, %rax - lea (%rdi,%rdx,8), %ecx - test $8, %cl - jz .Lrp_aligned - - - movq -8(%rsi,%rdx,8), %xmm0 - movq -16(%rsi,%rdx,8), %xmm1 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movq %xmm0, -8(%rdi,%rdx,8) - dec %rdx - -.Lrp_aligned: + neg %ecx lea 1(%rdx), %r8d - - and $6, %r8d - jz .Lba0 - cmp $4, %r8d - jz .Lba4 - jc .Lba2 -.Lba6: add $-4, %rdx - jmp 
.Li56 -.Lba0: add $-6, %rdx - jmp .Li70 -.Lba4: add $-2, %rdx - jmp .Li34 -.Lba2: add $-8, %rdx - jle .Lend - + and $3, %r8d + je .Lrlx + + dec %r8d + jne .L1 + + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + shr %cl, %r8 + or %r8, %r10 + mov %r10, -8(%rdi,%rdx,8) + dec %rdx + jmp .Lrll + +.L1: dec %r8d + je .L1x + + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + shr %cl, %r8 + or %r8, %r10 + mov %r10, -8(%rdi,%rdx,8) + dec %rdx + neg %ecx +.L1x: + cmp $1, %rdx + je .Last + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r11 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + mov -24(%rsi,%rdx,8), %r9 + shr %cl, %r8 + or %r8, %r10 + shr %cl, %r9 + or %r9, %r11 + mov %r10, -8(%rdi,%rdx,8) + mov %r11, -16(%rdi,%rdx,8) + sub $2, %rdx + +.Lrll: neg %ecx +.Lrlx: mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r11 + + sub $4, %rdx + jb .Lend .align 16, 0x90 -.Ltop: movdqu 40(%rsi,%rdx,8), %xmm1 - movdqu 48(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, 48(%rdi,%rdx,8) -.Li70: - movdqu 24(%rsi,%rdx,8), %xmm1 - movdqu 32(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, 32(%rdi,%rdx,8) -.Li56: - movdqu 8(%rsi,%rdx,8), %xmm1 - movdqu 16(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, 16(%rdi,%rdx,8) -.Li34: - movdqu -8(%rsi,%rdx,8), %xmm1 - movdqu (%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, (%rdi,%rdx,8) - sub $8, %rdx - jg .Ltop - -.Lend: test $1, %dl - jnz .Lend8 - - movdqu (%rsi), %xmm1 - pxor %xmm0, %xmm0 - punpcklqdq %xmm1, %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - movdqa %xmm0, (%rdi) +.Ltop: - ret - + neg %ecx + mov 16(%rsi,%rdx,8), %r8 + mov 8(%rsi,%rdx,8), %r9 + shr %cl, %r8 + or %r8, %r10 + shr %cl, %r9 + or 
%r9, %r11 + mov %r10, 24(%rdi,%rdx,8) + mov %r11, 16(%rdi,%rdx,8) + + mov 0(%rsi,%rdx,8), %r8 + mov -8(%rsi,%rdx,8), %r9 + shr %cl, %r8 + shr %cl, %r9 - .align 16, 0x90 -.Lbc: dec %edx - jz .Lend8 - - movq (%rsi,%rdx,8), %xmm1 - movq -8(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - movq %xmm0, (%rdi,%rdx,8) - sub $2, %edx - jl .Lend8 - movq 8(%rsi), %xmm1 - movq (%rsi), %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - movq %xmm0, 8(%rdi) - -.Lend8:movq (%rsi), %xmm0 - psllq %xmm4, %xmm0 - movq %xmm0, (%rdi) + + neg %ecx + mov 8(%rsi,%rdx,8), %r10 + mov 0(%rsi,%rdx,8), %r11 + shl %cl, %r10 + or %r10, %r8 + shl %cl, %r11 + or %r11, %r9 + mov %r8, 8(%rdi,%rdx,8) + mov %r9, 0(%rdi,%rdx,8) + + mov -8(%rsi,%rdx,8), %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r10 + shl %cl, %r11 + + sub $4, %rdx + jae .Ltop +.Lend: + neg %ecx + mov 8(%rsi), %r8 + shr %cl, %r8 + or %r8, %r10 + mov (%rsi), %r9 + shr %cl, %r9 + or %r9, %r11 + mov %r10, 16(%rdi) + mov %r11, 8(%rdi) + + neg %ecx +.Last: mov (%rsi), %r10 + shl %cl, %r10 + mov %r10, (%rdi) ret .size __gmpn_lshift,.-__gmpn_lshift - diff --git a/ext/gmp/gen/x86_64-linux/mpn/lshiftc.s b/ext/gmp/gen/x86_64-linux/mpn/lshiftc.s index 1ed069b688..680994041a 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/lshiftc.s +++ b/ext/gmp/gen/x86_64-linux/mpn/lshiftc.s @@ -41,32 +41,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -89,134 +63,135 @@ .text - .align 64, 0x90 + .align 32, 0x90 .globl __gmpn_lshiftc .type __gmpn_lshiftc,@function __gmpn_lshiftc: - movd %ecx, %xmm4 - mov $64, %eax - sub %ecx, %eax - movd %eax, %xmm5 - - neg %ecx + neg %ecx mov -8(%rsi,%rdx,8), %rax - shr %cl, %rax - - pcmpeqb %xmm3, %xmm3 - - cmp $3, %rdx - jle .Lbc - - lea (%rdi,%rdx,8), %ecx - test $8, %cl - jz .Lrp_aligned - - - movq -8(%rsi,%rdx,8), %xmm0 - movq -16(%rsi,%rdx,8), %xmm1 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movq %xmm0, -8(%rdi,%rdx,8) - dec 
%rdx + shr %cl, %rax -.Lrp_aligned: + neg %ecx lea 1(%rdx), %r8d - - and $6, %r8d - jz .Lba0 - cmp $4, %r8d - jz .Lba4 - jc .Lba2 -.Lba6: add $-4, %rdx - jmp .Li56 -.Lba0: add $-6, %rdx - jmp .Li70 -.Lba4: add $-2, %rdx - jmp .Li34 -.Lba2: add $-8, %rdx - jle .Lend - + and $3, %r8d + je .Lrlx + + dec %r8d + jne .L1 + + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + shr %cl, %r8 + or %r8, %r10 + not %r10 + mov %r10, -8(%rdi,%rdx,8) + dec %rdx + jmp .Lrll + +.L1: dec %r8d + je .L1x + + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + shr %cl, %r8 + or %r8, %r10 + not %r10 + mov %r10, -8(%rdi,%rdx,8) + dec %rdx + neg %ecx +.L1x: + cmp $1, %rdx + je .Last + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r11 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + mov -24(%rsi,%rdx,8), %r9 + shr %cl, %r8 + or %r8, %r10 + shr %cl, %r9 + or %r9, %r11 + not %r10 + not %r11 + mov %r10, -8(%rdi,%rdx,8) + mov %r11, -16(%rdi,%rdx,8) + sub $2, %rdx + +.Lrll: neg %ecx +.Lrlx: mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r11 + + sub $4, %rdx + jb .Lend .align 16, 0x90 -.Ltop: movdqu 40(%rsi,%rdx,8), %xmm1 - movdqu 48(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movdqa %xmm0, 48(%rdi,%rdx,8) -.Li70: - movdqu 24(%rsi,%rdx,8), %xmm1 - movdqu 32(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movdqa %xmm0, 32(%rdi,%rdx,8) -.Li56: - movdqu 8(%rsi,%rdx,8), %xmm1 - movdqu 16(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movdqa %xmm0, 16(%rdi,%rdx,8) -.Li34: - movdqu -8(%rsi,%rdx,8), %xmm1 - movdqu (%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movdqa %xmm0, (%rdi,%rdx,8) - sub $8, %rdx - jg .Ltop - -.Lend: test $1, %dl - jnz .Lend8 - - 
movdqu (%rsi), %xmm1 - pxor %xmm0, %xmm0 - punpcklqdq %xmm1, %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movdqa %xmm0, (%rdi) +.Ltop: - ret - + neg %ecx + mov 16(%rsi,%rdx,8), %r8 + mov 8(%rsi,%rdx,8), %r9 + shr %cl, %r8 + or %r8, %r10 + shr %cl, %r9 + or %r9, %r11 + not %r10 + not %r11 + mov %r10, 24(%rdi,%rdx,8) + mov %r11, 16(%rdi,%rdx,8) + + mov 0(%rsi,%rdx,8), %r8 + mov -8(%rsi,%rdx,8), %r9 + shr %cl, %r8 + shr %cl, %r9 - .align 16, 0x90 -.Lbc: dec %edx - jz .Lend8 - - movq (%rsi,%rdx,8), %xmm1 - movq -8(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movq %xmm0, (%rdi,%rdx,8) - sub $2, %edx - jl .Lend8 - movq 8(%rsi), %xmm1 - movq (%rsi), %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movq %xmm0, 8(%rdi) - -.Lend8:movq (%rsi), %xmm0 - psllq %xmm4, %xmm0 - pxor %xmm3, %xmm0 - movq %xmm0, (%rdi) + + neg %ecx + mov 8(%rsi,%rdx,8), %r10 + mov 0(%rsi,%rdx,8), %r11 + shl %cl, %r10 + or %r10, %r8 + shl %cl, %r11 + or %r11, %r9 + not %r8 + not %r9 + mov %r8, 8(%rdi,%rdx,8) + mov %r9, 0(%rdi,%rdx,8) + + mov -8(%rsi,%rdx,8), %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r10 + shl %cl, %r11 + + sub $4, %rdx + jae .Ltop +.Lend: + neg %ecx + mov 8(%rsi), %r8 + shr %cl, %r8 + or %r8, %r10 + mov (%rsi), %r9 + shr %cl, %r9 + or %r9, %r11 + not %r10 + not %r11 + mov %r10, 16(%rdi) + mov %r11, 8(%rdi) + + neg %ecx +.Last: mov (%rsi), %r10 + shl %cl, %r10 + not %r10 + mov %r10, (%rdi) ret .size __gmpn_lshiftc,.-__gmpn_lshiftc - diff --git a/ext/gmp/gen/x86_64-linux/mpn/mul_1.s b/ext/gmp/gen/x86_64-linux/mpn/mul_1.s index e8de366075..1644074e4d 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/mul_1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/mul_1.s @@ -66,8 +66,6 @@ - - @@ -81,120 +79,127 @@ + .text - .align 32, 0x90 + .align 16, 0x90 + .globl __gmpn_mul_1c + .type __gmpn_mul_1c,@function + +__gmpn_mul_1c: + + + + + push %rbx + mov %r8, %r10 + + jmp 
.Lcommon + .size __gmpn_mul_1c,.-__gmpn_mul_1c + .globl __gmpn_mul_1 .type __gmpn_mul_1,@function __gmpn_mul_1: - mov %rcx, %r10 - mov %rdx, %rcx - mov %edx, %r8d - shr $3, %rcx - and $7, %r8d - mov %r10, %rdx - lea .Ltab(%rip), %r10 - movslq (%r10,%r8,4), %r8 - lea (%r8, %r10), %r10 - jmp *%r10 - - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .Lf0-.Ltab - .long .Lf1-.Ltab - .long .Lf2-.Ltab - .long .Lf3-.Ltab - .long .Lf4-.Ltab - .long .Lf5-.Ltab - .long .Lf6-.Ltab - .long .Lf7-.Ltab - .text -.Lf0: .byte 0xc4,98,171,0xf6,6 - lea 56(%rsi), %rsi - lea -8(%rdi), %rdi - jmp .Lb0 - -.Lf3: .byte 0xc4,226,179,0xf6,6 - lea 16(%rsi), %rsi - lea 16(%rdi), %rdi - inc %rcx - jmp .Lb3 - -.Lf4: .byte 0xc4,98,171,0xf6,6 - lea 24(%rsi), %rsi - lea 24(%rdi), %rdi - inc %rcx - jmp .Lb4 - -.Lf5: .byte 0xc4,226,179,0xf6,6 - lea 32(%rsi), %rsi - lea 32(%rdi), %rdi - inc %rcx - jmp .Lb5 - -.Lf6: .byte 0xc4,98,171,0xf6,6 - lea 40(%rsi), %rsi - lea 40(%rdi), %rdi - inc %rcx - jmp .Lb6 - -.Lf7: .byte 0xc4,226,179,0xf6,6 - lea 48(%rsi), %rsi - lea 48(%rdi), %rdi - inc %rcx - jmp .Lb7 - -.Lf1: .byte 0xc4,226,179,0xf6,6 - test %rcx, %rcx - jnz .Lb1 -.L1: mov %r9, (%rdi) - ret -.Lf2: .byte 0xc4,98,171,0xf6,6 - lea 8(%rsi), %rsi - lea 8(%rdi), %rdi - .byte 0xc4,226,179,0xf6,6 - test %rcx, %rcx - jz .Lend - - .align 32, 0x90 -.Ltop: mov %r10, -8(%rdi) - adc %r8, %r9 -.Lb1: .byte 0xc4,98,171,0xf6,70,8 - adc %rax, %r10 - lea 64(%rsi), %rsi - mov %r9, (%rdi) -.Lb0: mov %r10, 8(%rdi) - .byte 0xc4,226,179,0xf6,70,208 - lea 64(%rdi), %rdi - adc %r8, %r9 -.Lb7: .byte 0xc4,98,171,0xf6,70,216 - mov %r9, -48(%rdi) - adc %rax, %r10 -.Lb6: mov %r10, -40(%rdi) - .byte 0xc4,226,179,0xf6,70,224 - adc %r8, %r9 -.Lb5: .byte 0xc4,98,171,0xf6,70,232 - mov %r9, -32(%rdi) - adc %rax, %r10 -.Lb4: .byte 0xc4,226,179,0xf6,70,240 - mov %r10, -24(%rdi) - adc %r8, %r9 -.Lb3: .byte 0xc4,98,171,0xf6,70,248 - adc %rax, %r10 - mov %r9, -16(%rdi) - dec %rcx - .byte 0xc4,226,179,0xf6,6 - jnz .Ltop - 
-.Lend: mov %r10, -8(%rdi) - adc %r8, %r9 - mov %r9, (%rdi) - adc %rcx, %rax + + push %rbx + xor %r10, %r10 +.Lcommon: + mov (%rsi), %rax + mov %rdx, %rbx + + mul %rcx + mov %rbx, %r11 + + add %r10, %rax + adc $0, %rdx + + and $3, %ebx + jz .Lb0 + cmp $2, %ebx + jz .Lb2 + jg .Lb3 + +.Lb1: dec %r11 + jne .Lgt1 + mov %rax, (%rdi) + jmp .Lret +.Lgt1: lea 8(%rsi,%r11,8), %rsi + lea -8(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + xor %ebx, %ebx + mov %rax, %r9 + mov (%rsi,%r11,8), %rax + mov %rdx, %r8 + jmp .LL1 + +.Lb0: lea (%rsi,%r11,8), %rsi + lea -16(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + mov %rax, %r8 + mov %rdx, %rbx + jmp .LL0 + +.Lb3: lea -8(%rsi,%r11,8), %rsi + lea -24(%rdi,%r11,8), %rdi + neg %r11 + mov %rax, %rbx + mov %rdx, %r10 + jmp .LL3 + +.Lb2: lea -16(%rsi,%r11,8), %rsi + lea -32(%rdi,%r11,8), %rdi + neg %r11 + xor %r8, %r8 + xor %ebx, %ebx + mov %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %rdx, %r9 + jmp .LL2 + + .align 16, 0x90 +.Ltop: mov %r10, (%rdi,%r11,8) + add %rax, %r9 + mov (%rsi,%r11,8), %rax + adc %rdx, %r8 + mov $0, %r10d +.LL1: mul %rcx + mov %r9, 8(%rdi,%r11,8) + add %rax, %r8 + adc %rdx, %rbx +.LL0: mov 8(%rsi,%r11,8), %rax + mul %rcx + mov %r8, 16(%rdi,%r11,8) + add %rax, %rbx + adc %rdx, %r10 +.LL3: mov 16(%rsi,%r11,8), %rax + mul %rcx + mov %rbx, 24(%rdi,%r11,8) + mov $0, %r8d + mov %r8, %rbx + add %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %r8, %r9 + adc %rdx, %r9 +.LL2: mul %rcx + add $4, %r11 + js .Ltop + + mov %r10, (%rdi,%r11,8) + add %rax, %r9 + adc %r8, %rdx + mov %r9, 8(%rdi,%r11,8) + add %r8, %rdx +.Lret: mov %rdx, %rax + + pop %rbx + + ret .size __gmpn_mul_1,.-__gmpn_mul_1 - diff --git a/ext/gmp/gen/x86_64-linux/mpn/mul_2.s b/ext/gmp/gen/x86_64-linux/mpn/mul_2.s index 395391597e..0c3310dfad 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/mul_2.s +++ b/ext/gmp/gen/x86_64-linux/mpn/mul_2.s @@ -81,13 +81,17 @@ + + + + .text - .align 32, 0x90 + .align 16, 0x90 .globl __gmpn_mul_2 .type __gmpn_mul_2,@function @@ -100,88 
+104,112 @@ __gmpn_mul_2: mov (%rcx), %r8 mov 8(%rcx), %r9 - lea 3(%rdx), %r11 - shr $2, %r11 - - test $1, %dl - jnz .Lbx1 + mov (%rsi), %rax + + mov %rdx, %r11 + neg %r11 + lea -8(%rsi,%rdx,8), %rsi + lea -8(%rdi,%rdx,8), %rdi + + and $3, %edx + jz .Lm2p0 + cmp $2, %edx + jc .Lm2p1 + jz .Lm2p2 +.Lm2p3: + mul %r8 + xor %r10d, %r10d + mov %rax, %rcx + mov %rdx, %rbp + mov 8(%rsi,%r11,8), %rax + add $-1, %r11 + mul %r9 + add %rax, %rbp + jmp .Lm23 +.Lm2p0: + mul %r8 + xor %ebp, %ebp + mov %rax, %rbx + mov %rdx, %rcx + jmp .Lm20 +.Lm2p1: + mul %r8 + xor %r10d, %r10d + xor %ebx, %ebx + xor %ecx, %ecx + add $1, %r11 + jmp .Lm2top +.Lm2p2: + mul %r8 + xor %ebx, %ebx + xor %ecx, %ecx + mov %rax, %rbp + mov %rdx, %r10 + mov 8(%rsi,%r11,8), %rax + add $-2, %r11 + jmp .Lm22 -.Lbx0: xor %rbx, %rbx - test $2, %dl - mov (%rsi), %rdx - .byte 0xc4,194,211,0xf6,200 - jz .Llo0 -.Lb10: lea -16(%rdi), %rdi - lea -16(%rsi), %rsi - jmp .Llo2 - -.Lbx1: xor %rbp, %rbp - test $2, %dl - mov (%rsi), %rdx - .byte 0xc4,66,227,0xf6,208 - jnz .Lb11 - -.Lb01: lea -24(%rdi), %rdi - lea 8(%rsi), %rsi - jmp .Llo1 - -.Lb11: lea -8(%rdi), %rdi - lea -8(%rsi), %rsi - jmp .Llo3 - - .align 16, 0x90 -.Ltop: .byte 0xc4,194,251,0xf6,217 - add %rax, %rbp - mov (%rsi), %rdx - .byte 0xc4,194,251,0xf6,200 - adc $0, %rbx - add %rax, %rbp - adc $0, %rcx - add %r10, %rbp -.Llo0: mov %rbp, (%rdi) - adc $0, %rcx - .byte 0xc4,194,251,0xf6,233 - add %rax, %rbx - mov 8(%rsi), %rdx - adc $0, %rbp - .byte 0xc4,66,251,0xf6,208 - add %rax, %rbx - adc $0, %r10 - add %rcx, %rbx -.Llo3: mov %rbx, 8(%rdi) - adc $0, %r10 - .byte 0xc4,194,251,0xf6,217 - add %rax, %rbp - mov 16(%rsi), %rdx - .byte 0xc4,194,251,0xf6,200 - adc $0, %rbx - add %rax, %rbp - adc $0, %rcx - add %r10, %rbp -.Llo2: mov %rbp, 16(%rdi) - adc $0, %rcx - .byte 0xc4,194,251,0xf6,233 - add %rax, %rbx - mov 24(%rsi), %rdx - adc $0, %rbp - .byte 0xc4,66,251,0xf6,208 - add %rax, %rbx - adc $0, %r10 - add %rcx, %rbx - lea 32(%rsi), %rsi -.Llo1: mov %rbx, 24(%rdi) 
- adc $0, %r10 - dec %r11 - lea 32(%rdi), %rdi - jnz .Ltop - -.Lend: .byte 0xc4,194,235,0xf6,193 - add %rdx, %rbp - adc $0, %rax - add %r10, %rbp - mov %rbp, (%rdi) - adc $0, %rax + .align 32, 0x90 +.Lm2top: + add %rax, %r10 + adc %rdx, %rbx + mov 0(%rsi,%r11,8), %rax + adc $0, %ecx + mov $0, %ebp + mul %r9 + add %rax, %rbx + mov %r10, 0(%rdi,%r11,8) + adc %rdx, %rcx + mov 8(%rsi,%r11,8), %rax + mul %r8 + add %rax, %rbx + adc %rdx, %rcx + adc $0, %ebp +.Lm20: mov 8(%rsi,%r11,8), %rax + mul %r9 + add %rax, %rcx + adc %rdx, %rbp + mov 16(%rsi,%r11,8), %rax + mov $0, %r10d + mul %r8 + add %rax, %rcx + mov 16(%rsi,%r11,8), %rax + adc %rdx, %rbp + adc $0, %r10d + mul %r9 + add %rax, %rbp + mov %rbx, 8(%rdi,%r11,8) +.Lm23: adc %rdx, %r10 + mov 24(%rsi,%r11,8), %rax + mul %r8 + mov $0, %ebx + add %rax, %rbp + adc %rdx, %r10 + mov %rcx, 16(%rdi,%r11,8) + mov 24(%rsi,%r11,8), %rax + mov $0, %ecx + adc $0, %ebx +.Lm22: mul %r9 + add %rax, %r10 + mov %rbp, 24(%rdi,%r11,8) + adc %rdx, %rbx + mov 32(%rsi,%r11,8), %rax + mul %r8 + add $4, %r11 + js .Lm2top + + + add %rax, %r10 + adc %rdx, %rbx + adc $0, %ecx + mov (%rsi), %rax + mul %r9 + mov %r10, (%rdi) + add %rax, %rbx + adc %rdx, %rcx + mov %rbx, 8(%rdi) + mov %rcx, %rax pop %rbp pop %rbx diff --git a/ext/gmp/gen/x86_64-linux/mpn/mul_basecase.s b/ext/gmp/gen/x86_64-linux/mpn/mul_basecase.s index 498782526f..2cfb7aaa17 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/mul_basecase.s +++ b/ext/gmp/gen/x86_64-linux/mpn/mul_basecase.s @@ -84,295 +84,400 @@ + .text + .align 16, 0x90 + .globl __gmpn_mul_basecase + .type __gmpn_mul_basecase,@function + +__gmpn_mul_basecase: + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + xor %r13d, %r13d + mov (%rsi), %rax + mov (%rcx), %r12 + sub %rdx, %r13 + mov %r13, %r11 + mov %edx, %ebx + lea (%rdi,%rdx,8), %rdi + lea (%rsi,%rdx,8), %rsi + mul %r12 + test $1, %r8b + jz .Lmul_2 - .text - .align 16, 0x90 - .globl __gmpn_mul_basecase - .type __gmpn_mul_basecase,@function - 
-__gmpn_mul_basecase: - +.Lmul_1: + and $3, %ebx + jz .Lmul_1_prologue_0 + cmp $2, %ebx + jc .Lmul_1_prologue_1 + jz .Lmul_1_prologue_2 - cmp $2, %rdx - ja .Lgen - mov (%rcx), %rdx - .byte 0xc4,98,251,0xf6,14 - je .Ls2x +.Lmul_1_prologue_3: + add $-1, %r11 + lea .Laddmul_outer_3(%rip), %r14 + mov %rax, %r10 + mov %rdx, %rbx + jmp .Lmul_1_entry_3 -.Ls11: mov %rax, (%rdi) - mov %r9, 8(%rdi) - - ret +.Lmul_1_prologue_0: + mov %rax, %rbp + mov %rdx, %r10 + lea .Laddmul_outer_0(%rip), %r14 + jmp .Lmul_1_entry_0 + +.Lmul_1_prologue_1: + cmp $-1, %r13 + jne 2f + mov %rax, -8(%rdi) + mov %rdx, (%rdi) + jmp .Lret +2: add $1, %r11 + lea .Laddmul_outer_1(%rip), %r14 + mov %rax, %r15 + mov %rdx, %rbp + xor %r10d, %r10d + mov (%rsi,%r11,8), %rax + jmp .Lmul_1_entry_1 + +.Lmul_1_prologue_2: + add $-2, %r11 + lea .Laddmul_outer_2(%rip), %r14 + mov %rax, %rbx + mov %rdx, %r15 + mov 24(%rsi,%r11,8), %rax + xor %ebp, %ebp + xor %r10d, %r10d + jmp .Lmul_1_entry_2 -.Ls2x: cmp $2, %r8 - .byte 0xc4,98,187,0xf6,86,8 - je .Ls22 -.Ls21: add %r8, %r9 - adc $0, %r10 - mov %rax, (%rdi) - mov %r9, 8(%rdi) - mov %r10, 16(%rdi) - ret -.Ls22: add %r8, %r9 - adc $0, %r10 - mov 8(%rcx), %rdx - mov %rax, (%rdi) - .byte 0xc4,98,187,0xf6,30 - .byte 0xc4,226,251,0xf6,86,8 - add %r11, %rax - adc $0, %rdx - add %r8, %r9 - adc %rax, %r10 - adc $0, %rdx - mov %r9, 8(%rdi) - mov %r10, 16(%rdi) - mov %rdx, 24(%rdi) + .align 16, 0x90 +.Lmul_1_top: + mov %rbx, -16(%rdi,%r11,8) + add %rax, %r15 + mov (%rsi,%r11,8), %rax + adc %rdx, %rbp +.Lmul_1_entry_1: + xor %ebx, %ebx + mul %r12 + mov %r15, -8(%rdi,%r11,8) + add %rax, %rbp + adc %rdx, %r10 +.Lmul_1_entry_0: + mov 8(%rsi,%r11,8), %rax + mul %r12 + mov %rbp, (%rdi,%r11,8) + add %rax, %r10 + adc %rdx, %rbx +.Lmul_1_entry_3: + mov 16(%rsi,%r11,8), %rax + mul %r12 + mov %r10, 8(%rdi,%r11,8) + xor %ebp, %ebp + mov %rbp, %r10 + add %rax, %rbx + mov 24(%rsi,%r11,8), %rax + mov %rbp, %r15 + adc %rdx, %r15 +.Lmul_1_entry_2: + mul %r12 + add $4, %r11 + js .Lmul_1_top + 
+ mov %rbx, -16(%rdi) + add %rax, %r15 + mov %r15, -8(%rdi) + adc %rdx, %rbp + mov %rbp, (%rdi) + + add $-1, %r8 + jz .Lret + + mov 8(%rcx), %r12 + mov 16(%rcx), %r9 + + lea 8(%rcx), %rcx + lea 8(%rdi), %rdi + + jmp *%r14 + + + + + .align 16, 0x90 +.Lmul_2: + mov 8(%rcx), %r9 + + and $3, %ebx + jz .Lmul_2_prologue_0 + cmp $2, %ebx + jz .Lmul_2_prologue_2 + jc .Lmul_2_prologue_1 + +.Lmul_2_prologue_3: + lea .Laddmul_outer_3(%rip), %r14 + add $2, %r11 + mov %rax, -16(%rdi,%r11,8) + mov %rdx, %rbp + xor %r10d, %r10d + xor %ebx, %ebx + mov -16(%rsi,%r11,8), %rax + jmp .Lmul_2_entry_3 + + .align 16, 0x90 +.Lmul_2_prologue_0: + add $3, %r11 + mov %rax, %rbx + mov %rdx, %r15 + xor %ebp, %ebp + mov -24(%rsi,%r11,8), %rax + lea .Laddmul_outer_0(%rip), %r14 + jmp .Lmul_2_entry_0 + + .align 16, 0x90 +.Lmul_2_prologue_1: + mov %rax, %r10 + mov %rdx, %rbx + xor %r15d, %r15d + lea .Laddmul_outer_1(%rip), %r14 + jmp .Lmul_2_entry_1 + + .align 16, 0x90 +.Lmul_2_prologue_2: + add $1, %r11 + lea .Laddmul_outer_2(%rip), %r14 + mov $0, %ebx + mov $0, %r15d + mov %rax, %rbp + mov -8(%rsi,%r11,8), %rax + mov %rdx, %r10 + jmp .Lmul_2_entry_2 + - ret .align 16, 0x90 -.Lgen: - push %rbx - push %rbp - push %r12 - push %r14 +.Lmul_2_top: + mov -32(%rsi,%r11,8), %rax + mul %r9 + add %rax, %rbx + adc %rdx, %r15 + mov -24(%rsi,%r11,8), %rax + xor %ebp, %ebp + mul %r12 + add %rax, %rbx + mov -24(%rsi,%r11,8), %rax + adc %rdx, %r15 + adc $0, %ebp +.Lmul_2_entry_0: + mul %r9 + add %rax, %r15 + mov %rbx, -24(%rdi,%r11,8) + adc %rdx, %rbp + mov -16(%rsi,%r11,8), %rax + mul %r12 + mov $0, %r10d + add %rax, %r15 + adc %rdx, %rbp + mov -16(%rsi,%r11,8), %rax + adc $0, %r10d + mov $0, %ebx + mov %r15, -16(%rdi,%r11,8) +.Lmul_2_entry_3: + mul %r9 + add %rax, %rbp + mov -8(%rsi,%r11,8), %rax + adc %rdx, %r10 + mov $0, %r15d + mul %r12 + add %rax, %rbp + mov -8(%rsi,%r11,8), %rax + adc %rdx, %r10 + adc %r15d, %ebx +.Lmul_2_entry_2: + mul %r9 + add %rax, %r10 + mov %rbp, -8(%rdi,%r11,8) + adc %rdx, %rbx + 
mov (%rsi,%r11,8), %rax + mul %r12 + add %rax, %r10 + adc %rdx, %rbx + adc $0, %r15d +.Lmul_2_entry_1: + add $4, %r11 + mov %r10, -32(%rdi,%r11,8) + js .Lmul_2_top + + mov -32(%rsi,%r11,8), %rax + mul %r9 + add %rax, %rbx + mov %rbx, (%rdi) + adc %rdx, %r15 + mov %r15, 8(%rdi) + + add $-2, %r8 + jz .Lret + + mov 16(%rcx), %r12 + mov 24(%rcx), %r9 + + lea 16(%rcx), %rcx + lea 16(%rdi), %rdi + + jmp *%r14 - mov %rcx, %r14 - lea 1(%rdx), %rbx + + + + + + + +.Laddmul_outer_0: + add $3, %r13 + lea 0(%rip), %r14 + + mov %r13, %r11 + mov -24(%rsi,%r13,8), %rax + mul %r12 + mov %rax, %rbx + mov -24(%rsi,%r13,8), %rax + mov %rdx, %r15 + xor %ebp, %ebp + jmp .Laddmul_entry_0 + +.Laddmul_outer_1: + mov %r13, %r11 + mov (%rsi,%r13,8), %rax + mul %r12 + mov %rax, %r10 + mov (%rsi,%r13,8), %rax + mov %rdx, %rbx + xor %r15d, %r15d + jmp .Laddmul_entry_1 + +.Laddmul_outer_2: + add $1, %r13 + lea 0(%rip), %r14 + + mov %r13, %r11 + mov -8(%rsi,%r13,8), %rax + mul %r12 + xor %ebx, %ebx + mov %rax, %rbp + xor %r15d, %r15d + mov %rdx, %r10 + mov -8(%rsi,%r13,8), %rax + jmp .Laddmul_entry_2 + +.Laddmul_outer_3: + add $2, %r13 + lea 0(%rip), %r14 + + mov %r13, %r11 + mov -16(%rsi,%r13,8), %rax + xor %r10d, %r10d + mul %r12 + mov %rax, %r15 + mov -16(%rsi,%r13,8), %rax mov %rdx, %rbp - mov %edx, %eax - and $-8, %rbx - shr $3, %rbp - neg %rbx - and $7, %eax - - mov %rbp, %rcx - mov (%r14), %rdx - lea 8(%r14), %r14 - - lea .Lmtab(%rip), %r10 - movslq (%r10,%rax,4), %r11 - lea (%r11, %r10), %r10 - jmp *%r10 - - -.Lmf0: .byte 0xc4,98,171,0xf6,30 - lea 56(%rsi), %rsi - lea -8(%rdi), %rdi - jmp .Lmb0 - -.Lmf3: .byte 0xc4,98,155,0xf6,14 - lea 16(%rsi), %rsi - lea 16(%rdi), %rdi - inc %rcx - jmp .Lmb3 - -.Lmf4: .byte 0xc4,98,171,0xf6,30 - lea 24(%rsi), %rsi - lea 24(%rdi), %rdi - inc %rcx - jmp .Lmb4 - -.Lmf5: .byte 0xc4,98,155,0xf6,14 - lea 32(%rsi), %rsi - lea 32(%rdi), %rdi - inc %rcx - jmp .Lmb5 - -.Lmf6: .byte 0xc4,98,171,0xf6,30 - lea 40(%rsi), %rsi - lea 40(%rdi), %rdi - inc %rcx - jmp 
.Lmb6 - -.Lmf7: .byte 0xc4,98,155,0xf6,14 - lea 48(%rsi), %rsi - lea 48(%rdi), %rdi - inc %rcx - jmp .Lmb7 - -.Lmf1: .byte 0xc4,98,155,0xf6,14 - jmp .Lmb1 - -.Lmf2: .byte 0xc4,98,171,0xf6,30 - lea 8(%rsi), %rsi - lea 8(%rdi), %rdi - .byte 0xc4,98,155,0xf6,14 + jmp .Laddmul_entry_3 + + + + .align 16, 0x90 +.Laddmul_top: + add %r10, -32(%rdi,%r11,8) + adc %rax, %rbx + mov -24(%rsi,%r11,8), %rax + adc %rdx, %r15 + xor %ebp, %ebp + mul %r12 + add %rax, %rbx + mov -24(%rsi,%r11,8), %rax + adc %rdx, %r15 + adc %ebp, %ebp +.Laddmul_entry_0: + mul %r9 + xor %r10d, %r10d + add %rbx, -24(%rdi,%r11,8) + adc %rax, %r15 + mov -16(%rsi,%r11,8), %rax + adc %rdx, %rbp + mul %r12 + add %rax, %r15 + mov -16(%rsi,%r11,8), %rax + adc %rdx, %rbp + adc $0, %r10d +.Laddmul_entry_3: + mul %r9 + add %r15, -16(%rdi,%r11,8) + adc %rax, %rbp + mov -8(%rsi,%r11,8), %rax + adc %rdx, %r10 + mul %r12 + xor %ebx, %ebx + add %rax, %rbp + adc %rdx, %r10 + mov $0, %r15d + mov -8(%rsi,%r11,8), %rax + adc %r15d, %ebx +.Laddmul_entry_2: + mul %r9 + add %rbp, -8(%rdi,%r11,8) + adc %rax, %r10 + adc %rdx, %rbx + mov (%rsi,%r11,8), %rax + mul %r12 + add %rax, %r10 + mov (%rsi,%r11,8), %rax + adc %rdx, %rbx + adc $0, %r15d +.Laddmul_entry_1: + mul %r9 + add $4, %r11 + js .Laddmul_top + + add %r10, -8(%rdi) + adc %rax, %rbx + mov %rbx, (%rdi) + adc %rdx, %r15 + mov %r15, 8(%rdi) + + add $-2, %r8 + jz .Lret + + lea 16(%rdi), %rdi + lea 16(%rcx), %rcx + + mov (%rcx), %r12 + mov 8(%rcx), %r9 + + jmp *%r14 .align 16, 0x90 -.Lm1top: - mov %r10, -8(%rdi) - adc %r11, %r12 -.Lmb1: .byte 0xc4,98,171,0xf6,94,8 - adc %r9, %r10 - lea 64(%rsi), %rsi - mov %r12, (%rdi) -.Lmb0: mov %r10, 8(%rdi) - .byte 0xc4,98,155,0xf6,78,208 - lea 64(%rdi), %rdi - adc %r11, %r12 -.Lmb7: .byte 0xc4,98,171,0xf6,94,216 - mov %r12, -48(%rdi) - adc %r9, %r10 -.Lmb6: mov %r10, -40(%rdi) - .byte 0xc4,98,155,0xf6,78,224 - adc %r11, %r12 -.Lmb5: .byte 0xc4,98,171,0xf6,94,232 - mov %r12, -32(%rdi) - adc %r9, %r10 -.Lmb4: .byte 
0xc4,98,155,0xf6,78,240 - mov %r10, -24(%rdi) - adc %r11, %r12 -.Lmb3: .byte 0xc4,98,171,0xf6,94,248 - adc %r9, %r10 - mov %r12, -16(%rdi) - dec %rcx - .byte 0xc4,98,155,0xf6,14 - jnz .Lm1top - -.Lm1end: - mov %r10, -8(%rdi) - adc %r11, %r12 - mov %r12, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - - dec %r8 - jz .Ldone - - lea .Latab(%rip), %r10 - movslq (%r10,%rax,4), %rax - lea (%rax, %r10), %rax - - -.Louter: - lea (%rsi,%rbx,8), %rsi - mov %rbp, %rcx - mov (%r14), %rdx - lea 8(%r14), %r14 - jmp *%rax - -.Lf0: .byte 0xc4,98,171,0xf6,94,8 - lea 8(%rdi,%rbx,8), %rdi - lea -1(%rcx), %rcx - jmp .Lb0 - -.Lf3: .byte 0xc4,98,155,0xf6,78,240 - lea -56(%rdi,%rbx,8), %rdi - jmp .Lb3 - -.Lf4: .byte 0xc4,98,171,0xf6,94,232 - lea -56(%rdi,%rbx,8), %rdi - jmp .Lb4 - -.Lf5: .byte 0xc4,98,155,0xf6,78,224 - lea -56(%rdi,%rbx,8), %rdi - jmp .Lb5 - -.Lf6: .byte 0xc4,98,171,0xf6,94,216 - lea -56(%rdi,%rbx,8), %rdi - jmp .Lb6 - -.Lf7: .byte 0xc4,98,155,0xf6,78,16 - lea 8(%rdi,%rbx,8), %rdi - jmp .Lb7 - -.Lf1: .byte 0xc4,98,155,0xf6,14 - lea 8(%rdi,%rbx,8), %rdi - jmp .Lb1 - -.Lam1end: - .byte 0xf3,76,0x0f,0x38,0xf6,39 - .byte 0xf3,76,0x0f,0x38,0xf6,201 - mov %r12, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - - dec %r8 - jnz .Louter -.Ldone: +.Lret: pop %r15 pop %r14 + pop %r13 pop %r12 pop %rbp pop %rbx ret -.Lf2: - .byte 0xc4,98,171,0xf6,94,248 - lea 8(%rdi,%rbx,8), %rdi - .byte 0xc4,98,155,0xf6,14 - - .align 16, 0x90 -.Lam1top: - .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,227 - mov %r10, -8(%rdi) - jrcxz .Lam1end -.Lb1: .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,39 - lea -1(%rcx), %rcx - mov %r12, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 -.Lb0: .byte 0xc4,98,155,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,227 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) -.Lb7: .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,103,16 - mov %r12, 16(%rdi) -.Lb6: .byte 
0xc4,98,155,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,227 - mov %r10, 24(%rdi) -.Lb5: .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,103,32 - mov %r12, 32(%rdi) -.Lb4: .byte 0xc4,98,155,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,227 - mov %r10, 40(%rdi) -.Lb3: .byte 0xf3,76,0x0f,0x38,0xf6,103,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r12, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,155,0xf6,14 - jmp .Lam1top - - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Lmtab:.long .Lmf0-.Lmtab - .long .Lmf1-.Lmtab - .long .Lmf2-.Lmtab - .long .Lmf3-.Lmtab - .long .Lmf4-.Lmtab - .long .Lmf5-.Lmtab - .long .Lmf6-.Lmtab - .long .Lmf7-.Lmtab -.Latab:.long .Lf0-.Latab - .long .Lf1-.Latab - .long .Lf2-.Latab - .long .Lf3-.Latab - .long .Lf4-.Latab - .long .Lf5-.Latab - .long .Lf6-.Latab - .long .Lf7-.Latab - .text .size __gmpn_mul_basecase,.-__gmpn_mul_basecase diff --git a/ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s b/ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s index 81d8b64e47..d76272ca92 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s +++ b/ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s @@ -55,6 +55,16 @@ + + + + + + + + + + @@ -67,340 +77,363 @@ .text - .align 32, 0x90 + .align 16, 0x90 .globl __gmpn_mullo_basecase .type __gmpn_mullo_basecase,@function __gmpn_mullo_basecase: - cmp $4, %ecx - jae .Lbig + cmp $4, %rcx + jge .Lgen + mov (%rsi), %rax + mov (%rdx), %r8 - mov %rdx, %r11 - mov (%rsi), %rdx + lea .Ltab(%rip), %r9 + movslq (%r9,%rcx,4), %r10 + add %r10, %r9 + jmp *%r9 + + .section .data.rel.ro.local,"a",@progbits + .align 8, 0x90 +.Ltab: .long .Ltab-.Ltab + .long .L1-.Ltab + .long .L2-.Ltab + .long .L3-.Ltab + .text - cmp $2, %ecx - jae .Lgt1 -.Ln1: imul (%r11), %rdx - mov %rdx, (%rdi) +.L1: imul %r8, %rax + mov %rax, (%rdi) ret -.Lgt1: ja .Lgt2 -.Ln2: mov (%r11), %r9 - .byte 
0xc4,194,251,0xf6,209 + +.L2: mov 8(%rdx), %r11 + imul %rax, %r11 + mul %r8 mov %rax, (%rdi) - mov 8(%rsi), %rax - imul %r9, %rax - add %rax, %rdx - mov 8(%r11), %r9 - mov (%rsi), %rcx - imul %r9, %rcx - add %rcx, %rdx - mov %rdx, 8(%rdi) + imul 8(%rsi), %r8 + lea (%r11, %rdx), %rax + add %r8, %rax + mov %rax, 8(%rdi) ret -.Lgt2: -.Ln3: mov (%r11), %r9 - .byte 0xc4,66,251,0xf6,209 - mov %rax, (%rdi) - mov 8(%rsi), %rdx - .byte 0xc4,194,251,0xf6,209 - imul 16(%rsi), %r9 - add %rax, %r10 - adc %rdx, %r9 - mov 8(%r11), %r8 - mov (%rsi), %rdx - .byte 0xc4,194,251,0xf6,208 - add %rax, %r10 + +.L3: mov 8(%rdx), %r9 + mov 16(%rdx), %r11 + mul %r8 + mov %rax, (%rdi) + mov (%rsi), %rax + mov %rdx, %rcx + mul %r9 + imul 8(%rsi), %r9 + mov 16(%rsi), %r10 + imul %r8, %r10 + add %rax, %rcx adc %rdx, %r9 - imul 8(%rsi), %r8 - add %r8, %r9 - mov %r10, 8(%rdi) - mov 16(%r11), %r10 - mov (%rsi), %rax - imul %rax, %r10 add %r10, %r9 + mov 8(%rsi), %rax + mul %r8 + add %rax, %rcx + adc %rdx, %r9 + mov %r11, %rax + imul (%rsi), %rax + add %rax, %r9 + mov %rcx, 8(%rdi) mov %r9, 16(%rdi) ret - .align 16, 0x90 -.Lbig: push %r14 - push %r12 - push %rbx +.L0m4: +.L1m4: +.L2m4: +.L3m4: +.Lgen: push %rbx push %rbp - mov -8(%rdx,%rcx,8), %r14 - imul (%rsi), %r14 - lea -3(%rcx), %ebp - lea 8(%rdx), %r11 - mov (%rdx), %rdx - - mov %ecx, %eax - shr $3, %ecx - and $7, %eax - lea .Lmtab(%rip), %r10 - movslq (%r10,%rax,4), %rax - lea (%rax, %r10), %r10 - jmp *%r10 - - -.Lmf0: .byte 0xc4,98,171,0xf6,6 - lea 56(%rsi), %rsi + push %r13 + push %r14 + push %r15 + + mov (%rsi), %rax + mov (%rdx), %r13 + mov %rdx, %r11 + + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + neg %rcx + + mul %r13 + + test $1, %cl + jz .Lmul_2 + +.Lmul_1: lea -8(%rdi), %rdi - lea .Lf7(%rip), %rbx - jmp .Lmb0 + lea -8(%rsi), %rsi + test $2, %cl + jnz .Lmul_1_prologue_3 + +.Lmul_1_prologue_2: + lea -1(%rcx), %r9 + lea .Laddmul_outer_1(%rip), %r8 + mov %rax, %rbx + mov %rdx, %r15 + xor %ebp, %ebp + xor %r10d, %r10d + mov 
16(%rsi,%rcx,8), %rax + jmp .Lmul_1_entry_2 + +.Lmul_1_prologue_3: + lea 1(%rcx), %r9 + lea .Laddmul_outer_3(%rip), %r8 + mov %rax, %rbp + mov %rdx, %r10 + xor %ebx, %ebx + jmp .Lmul_1_entry_0 + + .align 16, 0x90 +.Lmul_1_top: + mov %rbx, -16(%rdi,%r9,8) + add %rax, %r15 + mov (%rsi,%r9,8), %rax + adc %rdx, %rbp + xor %ebx, %ebx + mul %r13 + mov %r15, -8(%rdi,%r9,8) + add %rax, %rbp + adc %rdx, %r10 +.Lmul_1_entry_0: + mov 8(%rsi,%r9,8), %rax + mul %r13 + mov %rbp, (%rdi,%r9,8) + add %rax, %r10 + adc %rdx, %rbx + mov 16(%rsi,%r9,8), %rax + mul %r13 + mov %r10, 8(%rdi,%r9,8) + xor %ebp, %ebp + mov %rbp, %r10 + add %rax, %rbx + mov 24(%rsi,%r9,8), %rax + mov %rbp, %r15 + adc %rdx, %r15 +.Lmul_1_entry_2: + mul %r13 + add $4, %r9 + js .Lmul_1_top + + mov %rbx, -16(%rdi) + add %rax, %r15 + mov %r15, -8(%rdi) + adc %rdx, %rbp + + imul (%rsi), %r13 + add %r13, %rbp + mov %rbp, (%rdi) + + add $1, %rcx + jz .Lret + + mov 8(%r11), %r13 + mov 16(%r11), %r14 -.Lmf3: .byte 0xc4,226,179,0xf6,6 lea 16(%rsi), %rsi - lea 16(%rdi), %rdi - jrcxz .Lmc - inc %ecx - lea .Lf2(%rip), %rbx - jmp .Lmb3 + lea 8(%r11), %r11 + lea 24(%rdi), %rdi + + jmp *%r8 -.Lmc: .byte 0xc4,98,171,0xf6,70,248 + +.Lmul_2: + mov 8(%r11), %r14 + test $2, %cl + jz .Lmul_2_prologue_3 + + .align 16, 0x90 +.Lmul_2_prologue_1: + lea 0(%rcx), %r9 + mov %rax, %r10 + mov %rdx, %rbx + xor %r15d, %r15d + mov (%rsi,%rcx,8), %rax + lea .Laddmul_outer_3(%rip), %r8 + jmp .Lmul_2_entry_1 + + .align 16, 0x90 +.Lmul_2_prologue_3: + lea 2(%rcx), %r9 + mov $0, %r10d + mov %rax, %r15 + mov (%rsi,%rcx,8), %rax + mov %rdx, %rbp + lea .Laddmul_outer_1(%rip), %r8 + jmp .Lmul_2_entry_3 + + .align 16, 0x90 +.Lmul_2_top: + mov -32(%rsi,%r9,8), %rax + mul %r14 + add %rax, %rbx + adc %rdx, %r15 + mov -24(%rsi,%r9,8), %rax + xor %ebp, %ebp + mul %r13 + add %rax, %rbx + mov -24(%rsi,%r9,8), %rax + adc %rdx, %r15 + adc $0, %ebp + mul %r14 + add %rax, %r15 + mov %rbx, -24(%rdi,%r9,8) + adc %rdx, %rbp + mov -16(%rsi,%r9,8), %rax + mul %r13 + 
mov $0, %r10d + add %rax, %r15 + adc %rdx, %rbp + mov -16(%rsi,%r9,8), %rax + adc $0, %r10d +.Lmul_2_entry_3: + mov $0, %ebx + mov %r15, -16(%rdi,%r9,8) + mul %r14 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + mov $0, %r15d + mul %r13 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + adc %r15d, %ebx + mul %r14 add %rax, %r10 - mov %r9, -16(%rdi) - .byte 0xc4,226,179,0xf6,6 - mov %r10, -8(%rdi) - adc %r8, %r9 - mov %r9, (%rdi) - jmp .Lc2 - -.Lmf4: .byte 0xc4,98,171,0xf6,6 - lea 24(%rsi), %rsi - lea 24(%rdi), %rdi - inc %ecx - lea .Lf3(%rip), %rbx - jmp .Lmb4 - -.Lmf5: .byte 0xc4,226,179,0xf6,6 - lea 32(%rsi), %rsi - lea 32(%rdi), %rdi - inc %ecx - lea .Lf4(%rip), %rbx - jmp .Lmb5 - -.Lmf6: .byte 0xc4,98,171,0xf6,6 - lea 40(%rsi), %rsi - lea 40(%rdi), %rdi - inc %ecx - lea .Lf5(%rip), %rbx - jmp .Lmb6 - -.Lmf7: .byte 0xc4,226,179,0xf6,6 - lea 48(%rsi), %rsi - lea 48(%rdi), %rdi - lea .Lf6(%rip), %rbx - jmp .Lmb7 - -.Lmf1: .byte 0xc4,226,179,0xf6,6 - lea .Lf0(%rip), %rbx - jmp .Lmb1 - -.Lmf2: .byte 0xc4,98,171,0xf6,6 - lea 8(%rsi), %rsi - lea 8(%rdi), %rdi - lea .Lf1(%rip), %rbx - .byte 0xc4,226,179,0xf6,6 - - - .align 32, 0x90 -.Lmtop:mov %r10, -8(%rdi) - adc %r8, %r9 -.Lmb1: .byte 0xc4,98,171,0xf6,70,8 - adc %rax, %r10 - lea 64(%rsi), %rsi - mov %r9, (%rdi) -.Lmb0: mov %r10, 8(%rdi) - .byte 0xc4,226,179,0xf6,70,208 - lea 64(%rdi), %rdi - adc %r8, %r9 -.Lmb7: .byte 0xc4,98,171,0xf6,70,216 - mov %r9, -48(%rdi) - adc %rax, %r10 -.Lmb6: mov %r10, -40(%rdi) - .byte 0xc4,226,179,0xf6,70,224 - adc %r8, %r9 -.Lmb5: .byte 0xc4,98,171,0xf6,70,232 - mov %r9, -32(%rdi) - adc %rax, %r10 -.Lmb4: .byte 0xc4,226,179,0xf6,70,240 - mov %r10, -24(%rdi) - adc %r8, %r9 -.Lmb3: .byte 0xc4,98,171,0xf6,70,248 + mov %rbp, -8(%rdi,%r9,8) + adc %rdx, %rbx + mov (%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + adc %rdx, %rbx + adc $0, %r15d +.Lmul_2_entry_1: + add $4, %r9 + mov %r10, -32(%rdi,%r9,8) + js .Lmul_2_top + + imul -16(%rsi), %r14 + add %r14, %rbx + imul 
-8(%rsi), %r13 + add %r13, %rbx + mov %rbx, -8(%rdi) + + add $2, %rcx + jz .Lret + + mov 16(%r11), %r13 + mov 24(%r11), %r14 + + lea 16(%r11), %r11 + lea 16(%rdi), %rdi + + jmp *%r8 + + +.Laddmul_outer_1: + lea -2(%rcx), %r9 + mov -16(%rsi,%rcx,8), %rax + mul %r13 + mov %rax, %r10 + mov -16(%rsi,%rcx,8), %rax + mov %rdx, %rbx + xor %r15d, %r15d + lea .Laddmul_outer_3(%rip), %r8 + jmp .Laddmul_entry_1 + +.Laddmul_outer_3: + lea 0(%rcx), %r9 + mov -16(%rsi,%rcx,8), %rax + xor %r10d, %r10d + mul %r13 + mov %rax, %r15 + mov -16(%rsi,%rcx,8), %rax + mov %rdx, %rbp + lea .Laddmul_outer_1(%rip), %r8 + jmp .Laddmul_entry_3 + + .align 16, 0x90 +.Laddmul_top: + add %r10, -32(%rdi,%r9,8) + adc %rax, %rbx + mov -24(%rsi,%r9,8), %rax + adc %rdx, %r15 + xor %ebp, %ebp + mul %r13 + add %rax, %rbx + mov -24(%rsi,%r9,8), %rax + adc %rdx, %r15 + adc %ebp, %ebp + mul %r14 + xor %r10d, %r10d + add %rbx, -24(%rdi,%r9,8) + adc %rax, %r15 + mov -16(%rsi,%r9,8), %rax + adc %rdx, %rbp + mul %r13 + add %rax, %r15 + mov -16(%rsi,%r9,8), %rax + adc %rdx, %rbp + adc $0, %r10d +.Laddmul_entry_3: + mul %r14 + add %r15, -16(%rdi,%r9,8) + adc %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + mul %r13 + xor %ebx, %ebx + add %rax, %rbp + adc %rdx, %r10 + mov $0, %r15d + mov -8(%rsi,%r9,8), %rax + adc %r15d, %ebx + mul %r14 + add %rbp, -8(%rdi,%r9,8) adc %rax, %r10 - mov %r9, -16(%rdi) - dec %ecx - .byte 0xc4,226,179,0xf6,6 - jnz .Lmtop - -.Lmend:mov %r10, -8(%rdi) - adc %r8, %r9 - mov %r9, (%rdi) - adc %rcx, %rax - - lea 8(,%rbp,8), %r12 - neg %r12 - shr $3, %ebp - jmp .Lent - -.Lf0: .byte 0xc4,98,171,0xf6,6 - lea -8(%rsi), %rsi - lea -8(%rdi), %rdi - lea .Lf7(%rip), %rbx - jmp .Lb0 - -.Lf1: .byte 0xc4,226,179,0xf6,6 - lea -1(%rbp), %ebp - lea .Lf0(%rip), %rbx - jmp .Lb1 - -.Lend: .byte 0xf3,76,0x0f,0x38,0xf6,15 - mov %r9, (%rdi) - .byte 0xf3,72,0x0f,0x38,0xf6,193 - adc %rcx, %rax - lea 8(%r12), %r12 -.Lent: .byte 0xc4,98,171,0xf6,70,8 - add %rax, %r14 - add %r10, %r14 - lea (%rsi,%r12), %rsi 
- lea 8(%rdi,%r12), %rdi - mov (%r11), %rdx - lea 8(%r11), %r11 - or %ebp, %ecx - jmp *%rbx + adc %rdx, %rbx + mov (%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + mov (%rsi,%r9,8), %rax + adc %rdx, %rbx + adc $0, %r15d +.Laddmul_entry_1: + mul %r14 + add $4, %r9 + js .Laddmul_top + + add %r10, -32(%rdi) + adc %rax, %rbx + + imul -24(%rsi), %r13 + add %r13, %rbx + add %rbx, -24(%rdi) + + add $2, %rcx + jns .Lret + + lea 16(%r11), %r11 + + mov (%r11), %r13 + mov 8(%r11), %r14 -.Lf7: .byte 0xc4,226,179,0xf6,6 lea -16(%rsi), %rsi - lea -16(%rdi), %rdi - lea .Lf6(%rip), %rbx - jmp .Lb7 - -.Lf2: .byte 0xc4,98,171,0xf6,6 - lea 8(%rsi), %rsi - lea 8(%rdi), %rdi - .byte 0xc4,226,179,0xf6,6 - lea .Lf1(%rip), %rbx - - - .align 32, 0x90 -.Ltop: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, -8(%rdi) - jrcxz .Lend -.Lb1: .byte 0xc4,98,171,0xf6,70,8 - .byte 0xf3,76,0x0f,0x38,0xf6,15 - lea -1(%rcx), %ecx - mov %r9, (%rdi) - .byte 0x66,76,0x0f,0x38,0xf6,208 -.Lb0: .byte 0xc4,226,179,0xf6,70,16 - .byte 0x66,77,0x0f,0x38,0xf6,200 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) -.Lb7: .byte 0xc4,98,171,0xf6,70,24 - lea 64(%rsi), %rsi - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xf3,76,0x0f,0x38,0xf6,79,16 - mov %r9, 16(%rdi) -.Lb6: .byte 0xc4,226,179,0xf6,70,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, 24(%rdi) -.Lb5: .byte 0xc4,98,171,0xf6,70,232 - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xf3,76,0x0f,0x38,0xf6,79,32 - mov %r9, 32(%rdi) -.Lb4: .byte 0xc4,226,179,0xf6,70,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, 40(%rdi) -.Lb3: .byte 0xf3,76,0x0f,0x38,0xf6,79,48 - .byte 0xc4,98,171,0xf6,70,248 - mov %r9, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xc4,226,179,0xf6,6 - jmp .Ltop - -.Lf6: .byte 0xc4,98,171,0xf6,6 - lea 40(%rsi), %rsi - lea -24(%rdi), %rdi - lea .Lf5(%rip), %rbx - jmp .Lb6 - -.Lf5: .byte 0xc4,226,179,0xf6,6 - 
lea 32(%rsi), %rsi - lea -32(%rdi), %rdi - lea .Lf4(%rip), %rbx - jmp .Lb5 - -.Lf4: .byte 0xc4,98,171,0xf6,6 - lea 24(%rsi), %rsi - lea -40(%rdi), %rdi - lea .Lf3(%rip), %rbx - jmp .Lb4 - -.Lf3: .byte 0xc4,226,179,0xf6,6 - lea 16(%rsi), %rsi - lea -48(%rdi), %rdi - jrcxz .Lcor - lea .Lf2(%rip), %rbx - jmp .Lb3 - -.Lcor: .byte 0xf3,76,0x0f,0x38,0xf6,79,48 - .byte 0xc4,98,171,0xf6,70,248 - mov %r9, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xc4,226,179,0xf6,6 - .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, -8(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,15 - mov %r9, (%rdi) - .byte 0xf3,72,0x0f,0x38,0xf6,193 -.Lc2: - .byte 0xc4,98,171,0xf6,70,8 - adc %rax, %r14 - add %r10, %r14 - mov (%r11), %rdx - test %ecx, %ecx - .byte 0xc4,98,171,0xf6,70,240 - .byte 0xc4,226,179,0xf6,70,248 - .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, -8(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,15 - .byte 0xf3,72,0x0f,0x38,0xf6,193 - adc %rcx, %rax - .byte 0xc4,98,171,0xf6,6 - add %rax, %r14 - add %r10, %r14 - mov 8(%r11), %rdx - .byte 0xc4,226,243,0xf6,70,240 - add %r9, %rcx - mov %rcx, (%rdi) - adc $0, %rax - .byte 0xc4,98,171,0xf6,70,248 - add %rax, %r14 - add %r10, %r14 - mov %r14, 8(%rdi) + + jmp *%r8 + +.Lret: pop %r15 + pop %r14 + pop %r13 pop %rbp pop %rbx - pop %r12 - pop %r14 ret .size __gmpn_mullo_basecase,.-__gmpn_mullo_basecase - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Lmtab:.long .Lmf7-.Lmtab - .long .Lmf0-.Lmtab - .long .Lmf1-.Lmtab - .long .Lmf2-.Lmtab - .long .Lmf3-.Lmtab - .long .Lmf4-.Lmtab - .long .Lmf5-.Lmtab - .long .Lmf6-.Lmtab diff --git a/ext/gmp/gen/x86_64-linux/mpn/mulmid_basecase.s b/ext/gmp/gen/x86_64-linux/mpn/mulmid_basecase.s new file mode 100644 index 0000000000..b607e84aca --- /dev/null +++ b/ext/gmp/gen/x86_64-linux/mpn/mulmid_basecase.s @@ -0,0 +1,573 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_mulmid_basecase + .type __gmpn_mulmid_basecase,@function + +__gmpn_mulmid_basecase: + + + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rcx, %r15 + + + lea 1(%rdx), %r13 + sub %r8, %r13 + + lea (%rdi,%r13,8), %rdi + + cmp $4, %r13 + jc .Ldiagonal + + lea (%rsi,%rdx,8), %rsi + + test $1, %r8 + jz .Lmul_2 + + + + +.Lmul_1: + mov %r13d, %ebx + + neg %r13 + mov (%rsi,%r13,8), %rax + mov (%r15), %r12 + mul %r12 + + and $-4, %r13 + mov %r13, %r11 + + and $3, %ebx + jz .Lmul_1_prologue_0 + cmp $2, %ebx + jc .Lmul_1_prologue_1 + jz .Lmul_1_prologue_2 + +.Lmul_1_prologue_3: + mov %rax, %r10 + mov %rdx, %rbx + lea .Laddmul_prologue_3(%rip), %r14 + jmp .Lmul_1_entry_3 + + .align 16, 0x90 +.Lmul_1_prologue_0: + mov %rax, %rbp + mov %rdx, %r10 + lea .Laddmul_prologue_0(%rip), %r14 + jmp .Lmul_1_entry_0 + + .align 16, 0x90 +.Lmul_1_prologue_1: + add $4, %r11 + mov %rax, %rcx + mov %rdx, %rbp + mov $0, %r10d + mov (%rsi,%r11,8), %rax + lea .Laddmul_prologue_1(%rip), %r14 + jmp .Lmul_1_entry_1 + + .align 16, 0x90 +.Lmul_1_prologue_2: + mov %rax, %rbx + mov %rdx, %rcx + mov 24(%rsi,%r11,8), %rax + mov $0, %ebp + mov $0, %r10d + lea .Laddmul_prologue_2(%rip), %r14 + jmp .Lmul_1_entry_2 + + + + + .align 16, 0x90 +.Lmul_1_top: + mov %rbx, -16(%rdi,%r11,8) + add %rax, %rcx + mov (%rsi,%r11,8), %rax + adc %rdx, %rbp +.Lmul_1_entry_1: + mov $0, %ebx + mul %r12 + mov %rcx, -8(%rdi,%r11,8) + add %rax, %rbp + adc %rdx, %r10 +.Lmul_1_entry_0: + mov 8(%rsi,%r11,8), %rax + mul %r12 + mov %rbp, (%rdi,%r11,8) + add %rax, %r10 + adc %rdx, %rbx +.Lmul_1_entry_3: + mov 16(%rsi,%r11,8), %rax + mul %r12 + mov %r10, 8(%rdi,%r11,8) + mov $0, %ebp + mov %rbp, %r10 + add %rax, %rbx + mov 24(%rsi,%r11,8), %rax + mov %rbp, %rcx + adc %rdx, %rcx +.Lmul_1_entry_2: + mul %r12 + add $4, %r11 + js .Lmul_1_top + + mov %rbx, -16(%rdi) + add %rax, %rcx + mov %rcx, -8(%rdi) + mov 
%rbp, 8(%rdi) + adc %rdx, %rbp + mov %rbp, (%rdi) + + dec %r8 + jz .Lret + + lea -8(%rsi), %rsi + lea 8(%r15), %r15 + + mov %r13, %r11 + mov (%r15), %r12 + mov 8(%r15), %r9 + + jmp *%r14 + + + + + .align 16, 0x90 +.Lmul_2: + mov %r13d, %ebx + + neg %r13 + mov -8(%rsi,%r13,8), %rax + mov (%r15), %r12 + mov 8(%r15), %r9 + mul %r9 + + and $-4, %r13 + mov %r13, %r11 + + and $3, %ebx + jz .Lmul_2_prologue_0 + cmp $2, %ebx + jc .Lmul_2_prologue_1 + jz .Lmul_2_prologue_2 + +.Lmul_2_prologue_3: + mov %rax, %rcx + mov %rdx, %rbp + lea .Laddmul_prologue_3(%rip), %r14 + jmp .Lmul_2_entry_3 + + .align 16, 0x90 +.Lmul_2_prologue_0: + mov %rax, %rbx + mov %rdx, %rcx + lea .Laddmul_prologue_0(%rip), %r14 + jmp .Lmul_2_entry_0 + + .align 16, 0x90 +.Lmul_2_prologue_1: + mov %rax, %r10 + mov %rdx, %rbx + mov $0, %ecx + lea .Laddmul_prologue_1(%rip), %r14 + jmp .Lmul_2_entry_1 + + .align 16, 0x90 +.Lmul_2_prologue_2: + mov %rax, %rbp + mov %rdx, %r10 + mov $0, %ebx + mov 16(%rsi,%r11,8), %rax + lea .Laddmul_prologue_2(%rip), %r14 + jmp .Lmul_2_entry_2 + + + + + .align 16, 0x90 +.Lmul_2_top: + mov -8(%rsi,%r11,8), %rax + mul %r9 + add %rax, %rbx + adc %rdx, %rcx +.Lmul_2_entry_0: + mov $0, %ebp + mov (%rsi,%r11,8), %rax + mul %r12 + add %rax, %rbx + mov (%rsi,%r11,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r9 + add %rax, %rcx + mov %rbx, (%rdi,%r11,8) + adc %rdx, %rbp +.Lmul_2_entry_3: + mov 8(%rsi,%r11,8), %rax + mul %r12 + mov $0, %r10d + add %rax, %rcx + adc %rdx, %rbp + mov $0, %ebx + adc $0, %r10d + mov 8(%rsi,%r11,8), %rax + mov %rcx, 8(%rdi,%r11,8) + mul %r9 + add %rax, %rbp + mov 16(%rsi,%r11,8), %rax + adc %rdx, %r10 +.Lmul_2_entry_2: + mov $0, %ecx + mul %r12 + add %rax, %rbp + mov 16(%rsi,%r11,8), %rax + adc %rdx, %r10 + adc $0, %ebx + mul %r9 + add %rax, %r10 + mov %rbp, 16(%rdi,%r11,8) + adc %rdx, %rbx +.Lmul_2_entry_1: + mov 24(%rsi,%r11,8), %rax + mul %r12 + add %rax, %r10 + adc %rdx, %rbx + adc $0, %ecx + add $4, %r11 + mov %r10, -8(%rdi,%r11,8) + jnz .Lmul_2_top 
+ + mov %rbx, (%rdi) + mov %rcx, 8(%rdi) + + sub $2, %r8 + jz .Lret + + lea 16(%r15), %r15 + lea -16(%rsi), %rsi + + mov %r13, %r11 + mov (%r15), %r12 + mov 8(%r15), %r9 + + jmp *%r14 + + + + + .align 16, 0x90 +.Laddmul_prologue_0: + mov -8(%rsi,%r11,8), %rax + mul %r9 + mov %rax, %rcx + mov %rdx, %rbp + mov $0, %r10d + jmp .Laddmul_entry_0 + + .align 16, 0x90 +.Laddmul_prologue_1: + mov 16(%rsi,%r11,8), %rax + mul %r9 + mov %rax, %rbx + mov %rdx, %rcx + mov $0, %ebp + mov 24(%rsi,%r11,8), %rax + jmp .Laddmul_entry_1 + + .align 16, 0x90 +.Laddmul_prologue_2: + mov 8(%rsi,%r11,8), %rax + mul %r9 + mov %rax, %r10 + mov %rdx, %rbx + mov $0, %ecx + jmp .Laddmul_entry_2 + + .align 16, 0x90 +.Laddmul_prologue_3: + mov (%rsi,%r11,8), %rax + mul %r9 + mov %rax, %rbp + mov %rdx, %r10 + mov $0, %ebx + mov $0, %ecx + jmp .Laddmul_entry_3 + + + + .align 16, 0x90 +.Laddmul_top: + mov $0, %r10d + add %rax, %rbx + mov -8(%rsi,%r11,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r9 + add %rbx, -8(%rdi,%r11,8) + adc %rax, %rcx + adc %rdx, %rbp +.Laddmul_entry_0: + mov (%rsi,%r11,8), %rax + mul %r12 + add %rax, %rcx + mov (%rsi,%r11,8), %rax + adc %rdx, %rbp + adc $0, %r10d + mul %r9 + add %rcx, (%rdi,%r11,8) + mov $0, %ecx + adc %rax, %rbp + mov $0, %ebx + adc %rdx, %r10 +.Laddmul_entry_3: + mov 8(%rsi,%r11,8), %rax + mul %r12 + add %rax, %rbp + mov 8(%rsi,%r11,8), %rax + adc %rdx, %r10 + adc $0, %ebx + mul %r9 + add %rbp, 8(%rdi,%r11,8) + adc %rax, %r10 + adc %rdx, %rbx +.Laddmul_entry_2: + mov 16(%rsi,%r11,8), %rax + mul %r12 + add %rax, %r10 + mov 16(%rsi,%r11,8), %rax + adc %rdx, %rbx + adc $0, %ecx + mul %r9 + add %r10, 16(%rdi,%r11,8) + nop + adc %rax, %rbx + mov $0, %ebp + mov 24(%rsi,%r11,8), %rax + adc %rdx, %rcx +.Laddmul_entry_1: + mul %r12 + add $4, %r11 + jnz .Laddmul_top + + add %rax, %rbx + adc %rdx, %rcx + adc $0, %ebp + + add %rbx, -8(%rdi) + adc %rcx, (%rdi) + adc %rbp, 8(%rdi) + + sub $2, %r8 + jz .Lret + + lea 16(%r15), %r15 + lea -16(%rsi), %rsi + + mov %r13, 
%r11 + mov (%r15), %r12 + mov 8(%r15), %r9 + + jmp *%r14 + + + + + .align 16, 0x90 +.Ldiagonal: + xor %ebx, %ebx + xor %ecx, %ecx + xor %ebp, %ebp + + neg %r13 + + mov %r8d, %eax + and $3, %eax + jz .Ldiag_prologue_0 + cmp $2, %eax + jc .Ldiag_prologue_1 + jz .Ldiag_prologue_2 + +.Ldiag_prologue_3: + lea -8(%r15), %r15 + mov %r15, %r10 + add $1, %r8 + mov %r8, %r11 + lea .Ldiag_entry_3(%rip), %r14 + jmp .Ldiag_entry_3 + +.Ldiag_prologue_0: + mov %r15, %r10 + mov %r8, %r11 + lea 0(%rip), %r14 + mov -8(%rsi,%r11,8), %rax + jmp .Ldiag_entry_0 + +.Ldiag_prologue_1: + lea 8(%r15), %r15 + mov %r15, %r10 + add $3, %r8 + mov %r8, %r11 + lea 0(%rip), %r14 + mov -8(%r10), %rax + jmp .Ldiag_entry_1 + +.Ldiag_prologue_2: + lea -16(%r15), %r15 + mov %r15, %r10 + add $2, %r8 + mov %r8, %r11 + lea 0(%rip), %r14 + mov 16(%r10), %rax + jmp .Ldiag_entry_2 + + + + + .align 16, 0x90 +.Ldiag_top: + add %rax, %rbx + adc %rdx, %rcx + mov -8(%rsi,%r11,8), %rax + adc $0, %rbp +.Ldiag_entry_0: + mulq (%r10) + add %rax, %rbx + adc %rdx, %rcx + adc $0, %rbp +.Ldiag_entry_3: + mov -16(%rsi,%r11,8), %rax + mulq 8(%r10) + add %rax, %rbx + mov 16(%r10), %rax + adc %rdx, %rcx + adc $0, %rbp +.Ldiag_entry_2: + mulq -24(%rsi,%r11,8) + add %rax, %rbx + mov 24(%r10), %rax + adc %rdx, %rcx + lea 32(%r10), %r10 + adc $0, %rbp +.Ldiag_entry_1: + mulq -32(%rsi,%r11,8) + sub $4, %r11 + jnz .Ldiag_top + + add %rax, %rbx + adc %rdx, %rcx + adc $0, %rbp + + mov %rbx, (%rdi,%r13,8) + + inc %r13 + jz .Ldiag_end + + mov %r8, %r11 + mov %r15, %r10 + + lea 8(%rsi), %rsi + mov %rcx, %rbx + mov %rbp, %rcx + xor %ebp, %ebp + + jmp *%r14 + +.Ldiag_end: + mov %rcx, (%rdi) + mov %rbp, 8(%rdi) + +.Lret: pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + .size __gmpn_mulmid_basecase,.-__gmpn_mulmid_basecase diff --git a/ext/gmp/gen/x86_64-linux/mpn/nand_n.s b/ext/gmp/gen/x86_64-linux/mpn/nand_n.s index ad4e827623..04593b9b51 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/nand_n.s +++ 
b/ext/gmp/gen/x86_64-linux/mpn/nand_n.s @@ -94,7 +94,6 @@ - .text @@ -107,56 +106,48 @@ __gmpn_nand_n: mov (%rdx), %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: and (%rsi), %r8 +.Lb11: and (%rsi,%rcx,8), %r8 not %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: and (%rsi), %r8 + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: and (%rsi,%rcx,8), %r8 not %r8 - mov %r8, (%rdi) - dec %rcx + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 -.Lb00: mov 8(%rdx), %r9 - and (%rsi), %r8 + +.Ltop: mov (%rdx,%rcx,8), %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + and (%rsi,%rcx,8), %r8 not %r8 - and 8(%rsi), %r9 + and 8(%rsi,%rcx,8), %r9 not %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 -.Le10: mov 24(%rdx), %r9 - lea 32(%rdx), %rdx - and 16(%rsi), %r8 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + and 16(%rsi,%rcx,8), %r8 not %r8 - and 24(%rsi), %r9 - lea 32(%rsi), %rsi + and 24(%rsi,%rcx,8), %r9 not %r9 - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/nior_n.s b/ext/gmp/gen/x86_64-linux/mpn/nior_n.s index 68dffa7222..8ea0437f09 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/nior_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/nior_n.s @@ -94,7 +94,6 @@ - .text @@ -107,56 +106,48 @@ __gmpn_nior_n: mov (%rdx), %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea 
(%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: or (%rsi), %r8 +.Lb11: or (%rsi,%rcx,8), %r8 not %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: or (%rsi), %r8 + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: or (%rsi,%rcx,8), %r8 not %r8 - mov %r8, (%rdi) - dec %rcx + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 -.Lb00: mov 8(%rdx), %r9 - or (%rsi), %r8 + +.Ltop: mov (%rdx,%rcx,8), %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + or (%rsi,%rcx,8), %r8 not %r8 - or 8(%rsi), %r9 + or 8(%rsi,%rcx,8), %r9 not %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 -.Le10: mov 24(%rdx), %r9 - lea 32(%rdx), %rdx - or 16(%rsi), %r8 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + or 16(%rsi,%rcx,8), %r8 not %r8 - or 24(%rsi), %r9 - lea 32(%rsi), %rsi + or 24(%rsi,%rcx,8), %r9 not %r9 - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/popcount.s b/ext/gmp/gen/x86_64-linux/mpn/popcount.s index d118f5bda4..243219e87c 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/popcount.s +++ b/ext/gmp/gen/x86_64-linux/mpn/popcount.s @@ -59,16 +59,15 @@ - - - - - - - - - - + + + + + + + + + @@ -86,110 +85,76 @@ __gmpn_popcount: - - - mov %esi, %r8d - and $7, %r8d - - .byte 0xf3,0x48,0x0f,0xb8,0x07 - xor %ecx, %ecx - - lea .Ltab(%rip), %r9 - - movslq (%r9,%r8,4), %r8 - add %r9, %r8 - jmp *%r8 - - -.L3: .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x10 - add 
$24, %rdi - sub $8, %rsi - jg .Le34 - add %r10, %rax - add %r11, %rax -.Ls1: - ret - -.L1: sub $8, %rsi - jle .Ls1 - .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x10 - add $8, %rdi - jmp .Le12 - -.L7: .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x10 - add $-8, %rdi - jmp .Le07 - -.L0: .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 - .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 - jmp .Le07 - -.L4: .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 - .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 - add $32, %rdi - sub $8, %rsi - jle .Lx4 + + push %rbx + mov $0x5555555555555555, %r10 + push %rbp + mov $0x3333333333333333, %r11 + lea (%rdi,%rsi,8), %rdi + mov $0x0f0f0f0f0f0f0f0f, %rcx + neg %rsi + mov $0x0101010101010101, %rdx + xor %eax, %eax + test $1, %sil + jz .Ltop + + mov (%rdi,%rsi,8), %r8 + + mov %r8, %r9 + shr %r8 + and %r10, %r8 + sub %r8, %r9 + + mov %r9, %r8 + shr $2, %r9 + and %r11, %r8 + and %r11, %r9 + add %r8, %r9 + + dec %rsi + jmp .Lmid .align 16, 0x90 -.Ltop: -.Le34: .byte 0xf3,0x4c,0x0f,0xb8,0x07 - .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x08 - add %r10, %rcx - add %r11, %rax -.Le12: .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 - .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 - add %r8, %rcx - add %r9, %rax -.Le07: .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x20 - .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x28 - add %r10, %rcx - add %r11, %rax -.Le56: .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x30 - .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x38 - add $64, %rdi - add %r8, %rcx - add %r9, %rax - sub $8, %rsi - jg .Ltop - -.Lx4: add %r10, %rcx - add %r11, %rax -.Lx2: add %rcx, %rax - +.Ltop: mov (%rdi,%rsi,8), %r8 + mov 8(%rdi,%rsi,8), %rbx + + mov %r8, %r9 + mov %rbx, %rbp + shr %r8 + shr %rbx + and %r10, %r8 + and %r10, %rbx + sub %r8, %r9 + sub %rbx, %rbp + + mov %r9, %r8 + mov %rbp, %rbx + shr $2, %r9 + shr $2, %rbp + and %r11, %r8 + and %r11, %r9 + and %r11, %rbx + and %r11, %rbp + add %r8, %r9 + add %rbx, %rbp + + add %rbp, %r9 +.Lmid: mov 
%r9, %r8 + shr $4, %r9 + and %rcx, %r8 + and %rcx, %r9 + add %r8, %r9 + + imul %rdx, %r9 + shr $56, %r9 + + add %r9, %rax + add $2, %rsi + jnc .Ltop + +.Lend: + pop %rbp + pop %rbx ret - -.L2: .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 - sub $8, %rsi - jle .Lx2 - .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x10 - .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x18 - add $16, %rdi - jmp .Le12 - -.L5: .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x10 - add $-24, %rdi - jmp .Le56 - -.L6: .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x10 - .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x18 - add $-16, %rdi - jmp .Le56 .size __gmpn_popcount,.-__gmpn_popcount - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .L0-.Ltab - .long .L1-.Ltab - .long .L2-.Ltab - .long .L3-.Ltab - .long .L4-.Ltab - .long .L5-.Ltab - .long .L6-.Ltab - .long .L7-.Ltab diff --git a/ext/gmp/gen/x86_64-linux/mpn/redc_1.s b/ext/gmp/gen/x86_64-linux/mpn/redc_1.s index a5912b7b6d..da7fd88758 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/redc_1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/redc_1.s @@ -63,6 +63,11 @@ + + + + + @@ -77,14 +82,15 @@ - - + + + .text - .align 16, 0x90 + .align 32, 0x90 .globl __gmpn_redc_1 .type __gmpn_redc_1,@function @@ -92,356 +98,506 @@ __gmpn_redc_1: - push %rbx push %rbp + mov (%rsi), %rbp + push %rbx + imul %r8, %rbp push %r12 push %r13 push %r14 push %r15 - push %rdi - mov %rdx, %rdi - mov (%rsi), %rdx - - neg %rcx - push %r8 - imul %r8, %rdx - mov %rcx, %r15 - - test $1, %cl - jnz .Lbx1 - -.Lbx0: test $2, %cl - jz .Lo0b - - cmp $-2, %ecx - jnz .Lo2 - - - mov 8(%rsp), %rbx - lea 16(%rsp), %rsp - .byte 0xc4,98,179,0xf6,39 - .byte 0xc4,98,163,0xf6,87,8 - add %r12, %r11 - adc $0, %r10 - add (%rsi), %r9 - adc 8(%rsi), %r11 - adc $0, %r10 - mov %r11, %rdx - imul %r8, %rdx - .byte 0xc4,98,147,0xf6,39 - .byte 0xc4,98,139,0xf6,127,8 - xor %eax, %eax - add %r12, %r14 - adc $0, %r15 - add %r11, %r13 - adc 16(%rsi), %r14 - adc $0, %r15 - add %r14, %r10 - adc 24(%rsi), 
%r15 - mov %r10, (%rbx) - mov %r15, 8(%rbx) - setc %al - jmp .Lret -.Lo2: lea 2(%rcx), %r14 - .byte 0xc4,98,179,0xf6,7 - .byte 0xc4,98,163,0xf6,87,8 - sar $2, %r14 - add %r8, %r11 - jmp .Llo2 + mov %rcx, %r12 + neg %r12 + lea (%rdx,%rcx,8), %r13 + lea -16(%rsi,%rcx,8), %rsi + + mov %ecx, %eax + and $3, %eax + lea 4(%rax), %r9 + cmp $4, %ecx + cmovg %r9, %rax + lea .Ltab(%rip), %r9 + + movslq (%r9,%rax,4), %rax + add %r9, %rax + jmp *%rax + + + .section .data.rel.ro.local,"a",@progbits + .align 8, 0x90 +.Ltab: .long .L0-.Ltab + .long .L1-.Ltab + .long .L2-.Ltab + .long .L3-.Ltab + .long .L0m4-.Ltab + .long .L1m4-.Ltab + .long .L2m4-.Ltab + .long .L3m4-.Ltab + .text .align 16, 0x90 -.Ltp2: adc %rax, %r9 - lea 32(%rsi), %rsi - adc %r8, %r11 -.Llo2: .byte 0xc4,98,147,0xf6,103,16 - mov (%rsi), %r8 - .byte 0xc4,226,227,0xf6,71,24 - lea 32(%rdi), %rdi - adc %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov 8(%rsi), %r10 - mov 16(%rsi), %r12 - add %r9, %r8 - mov 24(%rsi), %rbp - mov %r8, (%rsi) - adc %r11, %r10 - .byte 0xc4,98,179,0xf6,7 - mov %r10, 8(%rsi) - adc %r13, %r12 - mov %r12, 16(%rsi) - adc %rbx, %rbp - .byte 0xc4,98,163,0xf6,87,8 - mov %rbp, 24(%rsi) - inc %r14 - jnz .Ltp2 - -.Led2: mov 56(%rsi,%rcx,8), %rdx - lea 16(%rdi,%rcx,8), %rdi - adc %rax, %r9 - adc %r8, %r11 - mov 32(%rsi), %r8 - adc $0, %r10 - imul (%rsp), %rdx - mov 40(%rsi), %rax - add %r9, %r8 - mov %r8, 32(%rsi) - adc %r11, %rax - mov %rax, 40(%rsi) - lea 56(%rsi,%rcx,8), %rsi - adc $0, %r10 - mov %r10, -8(%rsi) - inc %r15 - jnz .Lo2 - - jmp .Lcj - - -.Lbx1: test $2, %cl - jz .Lo3a - -.Lo1a: cmp $-1, %ecx - jnz .Lo1b - - - mov 8(%rsp), %rbx - lea 16(%rsp), %rsp - .byte 0xc4,98,163,0xf6,23 - add (%rsi), %r11 - adc 8(%rsi), %r10 - mov %r10, (%rbx) +.L1: mov (%rdx), %rax + mul %rbp + add 8(%rsi), %rax + adc 16(%rsi), %rdx + mov %rdx, (%rdi) mov $0, %eax - setc %al + adc %eax, %eax jmp .Lret -.Lo1b: lea 24(%rdi), %rdi -.Lo1: lea 1(%rcx), %r14 - .byte 0xc4,98,163,0xf6,87,232 - .byte 
0xc4,98,147,0xf6,103,240 - .byte 0xc4,226,227,0xf6,71,248 - sar $2, %r14 - add %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov (%rsi), %r10 - mov 8(%rsi), %r12 - mov 16(%rsi), %rbp - add %r11, %r10 - jmp .Llo1 .align 16, 0x90 -.Ltp1: adc %rax, %r9 - lea 32(%rsi), %rsi - adc %r8, %r11 - .byte 0xc4,98,147,0xf6,103,16 - mov -8(%rsi), %r8 - .byte 0xc4,226,227,0xf6,71,24 - lea 32(%rdi), %rdi - adc %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov (%rsi), %r10 - mov 8(%rsi), %r12 - add %r9, %r8 - mov 16(%rsi), %rbp - mov %r8, -8(%rsi) - adc %r11, %r10 -.Llo1: .byte 0xc4,98,179,0xf6,7 +.L2: mov (%rdx), %rax + mul %rbp + xor %r14d, %r14d + mov %rax, %r10 + mov -8(%r13), %rax + mov %rdx, %r9 + mul %rbp + add (%rsi), %r10 + adc %rax, %r9 + adc %rdx, %r14 + add 8(%rsi), %r9 + adc $0, %r14 + mov %r9, %rbp + imul %r8, %rbp + mov -16(%r13), %rax + mul %rbp + xor %ebx, %ebx + mov %rax, %r10 + mov -8(%r13), %rax + mov %rdx, %r11 + mul %rbp + add %r9, %r10 + adc %rax, %r11 + adc %rdx, %rbx + add 16(%rsi), %r11 + adc $0, %rbx + xor %eax, %eax + add %r11, %r14 + adc 24(%rsi), %rbx + mov %r14, (%rdi) + mov %rbx, 8(%rdi) + adc %eax, %eax + jmp .Lret + + +.L3: mov (%rdx), %rax + mul %rbp + mov %rax, %rbx + mov %rdx, %r10 + mov -16(%r13), %rax + mul %rbp + xor %r9d, %r9d + xor %r14d, %r14d + add -8(%rsi), %rbx + adc %rax, %r10 + mov -8(%r13), %rax + adc %rdx, %r9 + mul %rbp + add (%rsi), %r10 mov %r10, (%rsi) - adc %r13, %r12 - mov %r12, 8(%rsi) - adc %rbx, %rbp - .byte 0xc4,98,163,0xf6,87,8 - mov %rbp, 16(%rsi) - inc %r14 - jnz .Ltp1 - -.Led1: mov 48(%rsi,%rcx,8), %rdx - lea 40(%rdi,%rcx,8), %rdi adc %rax, %r9 - adc %r8, %r11 - mov 24(%rsi), %r8 - adc $0, %r10 - imul (%rsp), %rdx - mov 32(%rsi), %rax - add %r9, %r8 - mov %r8, 24(%rsi) - adc %r11, %rax - mov %rax, 32(%rsi) - lea 48(%rsi,%rcx,8), %rsi - adc $0, %r10 - mov %r10, -8(%rsi) - inc %r15 - jnz .Lo1 - - jmp .Lcj - -.Lo3a: cmp $-3, %ecx - jnz .Lo3b - - -.Ln3: .byte 0xc4,226,227,0xf6,7 - .byte 0xc4,98,179,0xf6,119,8 + adc %rdx, 
%r14 + mov %r10, %rbp + imul %r8, %rbp + add %r9, 8(%rsi) + adc $0, %r14 + mov %r14, -8(%rsi) + + mov -24(%r13), %rax + mul %rbp + mov %rax, %rbx + mov %rdx, %r10 + mov -16(%r13), %rax + mul %rbp + xor %r9d, %r9d + xor %r14d, %r14d add (%rsi), %rbx - .byte 0xc4,98,163,0xf6,87,16 - adc %rax, %r9 - adc %r14, %r11 - mov 8(%rsi), %r14 - mov %r8, %rdx - adc $0, %r10 - mov 16(%rsi), %rax - add %r9, %r14 - mov %r14, 8(%rsi) - .byte 0xc4,66,235,0xf6,238 - adc %r11, %rax - mov %rax, 16(%rsi) - adc $0, %r10 - mov %r10, (%rsi) - lea 8(%rsi), %rsi - inc %r15 - jnz .Ln3 - - jmp .Lcj - -.Lo3b: lea 8(%rdi), %rdi -.Lo3: lea 4(%rcx), %r14 - .byte 0xc4,226,227,0xf6,71,248 - .byte 0xc4,98,179,0xf6,7 - mov (%rsi), %rbp - .byte 0xc4,98,163,0xf6,87,8 - sar $2, %r14 - add %rbx, %rbp - nop + adc %rax, %r10 + mov -8(%r13), %rax + adc %rdx, %r9 + mul %rbp + add 8(%rsi), %r10 + mov %r10, 8(%rsi) + adc %rax, %r9 + adc %rdx, %r14 + mov %r10, %rbp + imul %r8, %rbp + add %r9, 16(%rsi) + adc $0, %r14 + mov %r14, (%rsi) + + mov -24(%r13), %rax + mul %rbp + mov %rax, %rbx + mov %rdx, %r10 + mov -16(%r13), %rax + mul %rbp + xor %r9d, %r9d + xor %r14d, %r14d + add 8(%rsi), %rbx + adc %rax, %r10 + mov -8(%r13), %rax + adc %rdx, %r9 + mul %rbp + add 16(%rsi), %r10 adc %rax, %r9 - jmp .Llo3 + adc %rdx, %r14 + add 24(%rsi), %r9 + adc $0, %r14 + + xor %eax, %eax + add -8(%rsi), %r10 + adc (%rsi), %r9 + adc 32(%rsi), %r14 + mov %r10, (%rdi) + mov %r9, 8(%rdi) + mov %r14, 16(%rdi) + adc %eax, %eax + jmp .Lret + .align 16, 0x90 -.Ltp3: adc %rax, %r9 - lea 32(%rsi), %rsi -.Llo3: adc %r8, %r11 - .byte 0xc4,98,147,0xf6,103,16 - mov 8(%rsi), %r8 - .byte 0xc4,226,227,0xf6,71,24 - lea 32(%rdi), %rdi - adc %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov 16(%rsi), %r10 - mov 24(%rsi), %r12 - add %r9, %r8 - mov 32(%rsi), %rbp - mov %r8, 8(%rsi) - adc %r11, %r10 - .byte 0xc4,98,179,0xf6,7 - mov %r10, 16(%rsi) - adc %r13, %r12 - mov %r12, 24(%rsi) - adc %rbx, %rbp - .byte 0xc4,98,163,0xf6,87,8 - mov %rbp, 32(%rsi) - 
inc %r14 - jnz .Ltp3 - -.Led3: mov 64(%rsi,%rcx,8), %rdx - lea 24(%rdi,%rcx,8), %rdi +.L2m4: +.Llo2: mov (%r13,%r12,8), %rax + mul %rbp + xor %r14d, %r14d + xor %ebx, %ebx + mov %rax, %r10 + mov 8(%r13,%r12,8), %rax + mov 24(%rsi,%r12,8), %r15 + mov %rdx, %r9 + mul %rbp + add 16(%rsi,%r12,8), %r10 adc %rax, %r9 - adc %r8, %r11 - mov 40(%rsi), %r8 - adc $0, %r10 - imul (%rsp), %rdx - mov 48(%rsi), %rax - add %r9, %r8 - mov %r8, 40(%rsi) - adc %r11, %rax - mov %rax, 48(%rsi) - lea 64(%rsi,%rcx,8), %rsi - adc $0, %r10 - mov %r10, -8(%rsi) - inc %r15 - jnz .Lo3 - - jmp .Lcj - -.Lo0b: lea 16(%rdi), %rdi -.Lo0: mov %rcx, %r14 - .byte 0xc4,98,147,0xf6,103,240 - .byte 0xc4,226,227,0xf6,71,248 - sar $2, %r14 - add %r12, %rbx - adc $0, %rax - mov (%rsi), %r12 - mov 8(%rsi), %rbp - .byte 0xc4,98,179,0xf6,7 - add %r13, %r12 - jmp .Llo0 + mov 16(%r13,%r12,8), %rax + adc %rdx, %r14 + mul %rbp + mov $0, %r10d + lea 2(%r12), %r11 + add %r9, %r15 + imul %r8, %r15 + jmp .Le2 .align 16, 0x90 -.Ltp0: adc %rax, %r9 - lea 32(%rsi), %rsi - adc %r8, %r11 - .byte 0xc4,98,147,0xf6,103,16 +.Lli2: add %r10, (%rsi,%r11,8) + adc %rax, %r9 + mov (%r13,%r11,8), %rax + adc %rdx, %r14 + xor %r10d, %r10d + mul %rbp +.Le2: add %r9, 8(%rsi,%r11,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(%r13,%r11,8), %rax + mul %rbp + add %r14, 16(%rsi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(%r13,%r11,8), %rax + mul %rbp + add %rbx, 24(%rsi,%r11,8) + mov $0, %r14d + mov %r14, %rbx + adc %rax, %r10 + mov 24(%r13,%r11,8), %rax + mov %r14, %r9 + adc %rdx, %r9 + mul %rbp + add $4, %r11 + js .Lli2 + +.Lle2: add %r10, (%rsi) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(%rsi) + adc $0, %rdx + mov %rdx, 16(%rsi,%r12,8) + add $8, %rsi + mov %r15, %rbp + dec %rcx + jnz .Llo2 + + mov %r12, %rcx + sar $2, %rcx + lea 32(%rsi,%r12,8), %rsi + lea (%rsi,%r12,8), %rdx + mov -16(%rsi), %r8 - .byte 0xc4,226,227,0xf6,71,24 - lea 32(%rdi), %rdi - adc %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov -8(%rsi), %r10 - mov 
(%rsi), %r12 - add %r9, %r8 - mov 8(%rsi), %rbp - mov %r8, -16(%rsi) - adc %r11, %r10 - .byte 0xc4,98,179,0xf6,7 - mov %r10, -8(%rsi) - adc %r13, %r12 - mov %r12, (%rsi) -.Llo0: adc %rbx, %rbp - .byte 0xc4,98,163,0xf6,87,8 - mov %rbp, 8(%rsi) - inc %r14 - jnz .Ltp0 - -.Led0: mov 40(%rsi,%rcx,8), %rdx - lea 32(%rdi,%rcx,8), %rdi + mov -8(%rsi), %r9 + add -16(%rdx), %r8 + adc -8(%rdx), %r9 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + lea 16(%rdi), %rdi + jmp .Laddx + + + .align 16, 0x90 +.L1m4: +.Llo1: mov (%r13,%r12,8), %rax + xor %r9, %r9 + xor %ebx, %ebx + mul %rbp + mov %rax, %r9 + mov 8(%r13,%r12,8), %rax + mov 24(%rsi,%r12,8), %r15 + mov %rdx, %r14 + mov $0, %r10d + mul %rbp + add 16(%rsi,%r12,8), %r9 + adc %rax, %r14 + adc %rdx, %rbx + mov 16(%r13,%r12,8), %rax + mul %rbp + lea 1(%r12), %r11 + add %r14, %r15 + imul %r8, %r15 + jmp .Le1 + + .align 16, 0x90 +.Lli1: add %r10, (%rsi,%r11,8) + adc %rax, %r9 + mov (%r13,%r11,8), %rax + adc %rdx, %r14 + xor %r10d, %r10d + mul %rbp + add %r9, 8(%rsi,%r11,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(%r13,%r11,8), %rax + mul %rbp +.Le1: add %r14, 16(%rsi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(%r13,%r11,8), %rax + mul %rbp + add %rbx, 24(%rsi,%r11,8) + mov $0, %r14d + mov %r14, %rbx + adc %rax, %r10 + mov 24(%r13,%r11,8), %rax + mov %r14, %r9 + adc %rdx, %r9 + mul %rbp + add $4, %r11 + js .Lli1 + +.Lle1: add %r10, (%rsi) adc %rax, %r9 - adc %r8, %r11 - mov 16(%rsi), %r8 - adc $0, %r10 - imul (%rsp), %rdx - mov 24(%rsi), %rax - add %r9, %r8 - mov %r8, 16(%rsi) - adc %r11, %rax - mov %rax, 24(%rsi) - lea 40(%rsi,%rcx,8), %rsi - adc $0, %r10 - mov %r10, -8(%rsi) - inc %r15 - jnz .Lo0 - -.Lcj: - mov 8(%rsp), %rdi - lea 16-8(%rsp), %rsp - lea (%rsi,%rcx,8), %rdx - neg %ecx + adc %r14, %rdx + add %r9, 8(%rsi) + adc $0, %rdx + mov %rdx, 16(%rsi,%r12,8) + add $8, %rsi + mov %r15, %rbp + dec %rcx + jnz .Llo1 + + mov %r12, %rcx + sar $2, %rcx + lea 24(%rsi,%r12,8), %rsi + lea (%rsi,%r12,8), %rdx - + mov -8(%rsi), %r8 + add 
-8(%rdx), %r8 + mov %r8, (%rdi) + lea 8(%rdi), %rdi + jmp .Laddx - - call __gmpn_add_n@PLT + + .align 16, 0x90 +.L0: +.L0m4: +.Llo0: mov (%r13,%r12,8), %rax + mov %r12, %r11 + mul %rbp + xor %r10d, %r10d + mov %rax, %r14 + mov %rdx, %rbx + mov 8(%r13,%r12,8), %rax + mov 24(%rsi,%r12,8), %r15 + mul %rbp + add 16(%rsi,%r12,8), %r14 + adc %rax, %rbx + adc %rdx, %r10 + add %rbx, %r15 + imul %r8, %r15 + jmp .Le0 + + .align 16, 0x90 +.Lli0: add %r10, (%rsi,%r11,8) + adc %rax, %r9 + mov (%r13,%r11,8), %rax + adc %rdx, %r14 + xor %r10d, %r10d + mul %rbp + add %r9, 8(%rsi,%r11,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(%r13,%r11,8), %rax + mul %rbp + add %r14, 16(%rsi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 +.Le0: mov 16(%r13,%r11,8), %rax + mul %rbp + add %rbx, 24(%rsi,%r11,8) + mov $0, %r14d + mov %r14, %rbx + adc %rax, %r10 + mov 24(%r13,%r11,8), %rax + mov %r14, %r9 + adc %rdx, %r9 + mul %rbp + add $4, %r11 + js .Lli0 + +.Lle0: add %r10, (%rsi) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(%rsi) + adc $0, %rdx + mov %rdx, 16(%rsi,%r12,8) + add $8, %rsi + mov %r15, %rbp + dec %rcx + jnz .Llo0 + + mov %r12, %rcx + sar $2, %rcx + clc + lea 16(%rsi,%r12,8), %rsi + lea (%rsi,%r12,8), %rdx + jmp .Laddy + + + .align 16, 0x90 +.L3m4: +.Llo3: mov (%r13,%r12,8), %rax + mul %rbp + mov %rax, %rbx + mov %rdx, %r10 + mov 8(%r13,%r12,8), %rax + mov 24(%rsi,%r12,8), %r15 + mul %rbp + add 16(%rsi,%r12,8), %rbx + mov $0, %ebx + mov %rbx, %r14 + adc %rax, %r10 + mov 16(%r13,%r12,8), %rax + mov %r14, %r9 + adc %rdx, %r9 + add %r10, %r15 + mul %rbp + lea 3(%r12), %r11 + imul %r8, %r15 + + + .align 16, 0x90 +.Lli3: add %r10, (%rsi,%r11,8) + adc %rax, %r9 + mov (%r13,%r11,8), %rax + adc %rdx, %r14 + xor %r10d, %r10d + mul %rbp + add %r9, 8(%rsi,%r11,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(%r13,%r11,8), %rax + mul %rbp + add %r14, 16(%rsi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(%r13,%r11,8), %rax + mul %rbp + add %rbx, 24(%rsi,%r11,8) + mov $0, %r14d + mov %r14, %rbx + adc 
%rax, %r10 + mov 24(%r13,%r11,8), %rax + mov %r14, %r9 + adc %rdx, %r9 + mul %rbp + add $4, %r11 + js .Lli3 + +.Lle3: add %r10, (%rsi) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(%rsi) + adc $0, %rdx + mov %rdx, 16(%rsi,%r12,8) + mov %r15, %rbp + lea 8(%rsi), %rsi + dec %rcx + jnz .Llo3 - lea 8(%rsp), %rsp + mov %r12, %rcx + sar $2, %rcx + lea 40(%rsi,%r12,8), %rsi + lea (%rsi,%r12,8), %rdx + + mov -24(%rsi), %r8 + mov -16(%rsi), %r9 + mov -8(%rsi), %r10 + add -24(%rdx), %r8 + adc -16(%rdx), %r9 + adc -8(%rdx), %r10 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + lea 24(%rdi), %rdi + +.Laddx:inc %rcx + jz .Lad3 + +.Laddy:mov (%rsi), %r8 + mov 8(%rsi), %r9 + inc %rcx + jmp .Lmid + + +.Lal3: adc (%rdx), %r8 + adc 8(%rdx), %r9 + adc 16(%rdx), %r10 + adc 24(%rdx), %r11 + mov %r8, (%rdi) + lea 32(%rsi), %rsi + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + inc %rcx + mov %r11, 24(%rdi) + lea 32(%rdx), %rdx + mov (%rsi), %r8 + mov 8(%rsi), %r9 + lea 32(%rdi), %rdi +.Lmid: mov 16(%rsi), %r10 + mov 24(%rsi), %r11 + jnz .Lal3 + +.Lae3: adc (%rdx), %r8 + adc 8(%rdx), %r9 + adc 16(%rdx), %r10 + adc 24(%rdx), %r11 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + mov %r11, 24(%rdi) + +.Lad3: mov %ecx, %eax + adc %eax, %eax .Lret: pop %r15 pop %r14 pop %r13 pop %r12 - pop %rbp pop %rbx + pop %rbp ret .size __gmpn_redc_1,.-__gmpn_redc_1 diff --git a/ext/gmp/gen/x86_64-linux/mpn/rsblsh1_n.s b/ext/gmp/gen/x86_64-linux/mpn/rsblsh1_n.s index 7eebcc0aff..ac1323b3c6 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/rsblsh1_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/rsblsh1_n.s @@ -47,15 +47,6 @@ - - - - - - - - - @@ -78,6 +69,9 @@ + + + @@ -86,29 +80,6 @@ .text - .align 16, 0x90 - .globl __gmpn_rsblsh1_nc - .type __gmpn_rsblsh1_nc,@function - -__gmpn_rsblsh1_nc: - - - - push %rbp - mov %r8, %rax - neg %rax - xor %ebp, %ebp - mov (%rdx), %r8 - shrd $63, %r8, %rbp - mov %ecx, %r9d - and $3, %r9d - je .Lb00 - cmp $2, %r9d - jc .Lb01 - je .Lb10 - jmp .Lb11 - .size 
__gmpn_rsblsh1_nc,.-__gmpn_rsblsh1_nc - .align 16, 0x90 .globl __gmpn_rsblsh1_n .type __gmpn_rsblsh1_n,@function @@ -117,96 +88,92 @@ __gmpn_rsblsh1_n: push %rbp - xor %ebp, %ebp + mov (%rdx), %r8 - shrd $63, %r8, %rbp mov %ecx, %eax + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rdx + neg %rcx + xor %ebp, %ebp and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: mov 8(%rdx), %r9 - shrd $63, %r9, %r8 - mov 16(%rdx), %r10 - shrd $63, %r10, %r9 - add %eax, %eax - sbb (%rsi), %rbp - sbb 8(%rsi), %r8 - sbb 16(%rsi), %r9 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, %rbp - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea 24(%rdi), %rdi +.Lb11: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 sbb %eax, %eax - sub $3, %rcx - ja .Ltop - jmp .Lend - -.Lb01: add %eax, %eax - sbb (%rsi), %rbp - mov %rbp, (%rdi) - mov %r8, %rbp - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi + sub (%rsi,%rcx,8), %r8 + sbb 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + sbb 16(%rsi,%rcx,8), %r10 + mov %r10, 16(%rdi,%rcx,8) + sbb %ebp, %ebp + add $3, %rcx + jmp .Lent + +.Lb10: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 sbb %eax, %eax - sub $1, %rcx - ja .Ltop - jmp .Lend - -.Lb10: mov 8(%rdx), %r9 - shrd $63, %r9, %r8 - add %eax, %eax - sbb (%rsi), %rbp - sbb 8(%rsi), %r8 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, %rbp - lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea 16(%rdi), %rdi + sub (%rsi,%rcx,8), %r8 + sbb 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + sbb %ebp, %ebp + add $2, %rcx + jmp .Lent + +.Lb01: add %r8, %r8 sbb %eax, %eax - sub $2, %rcx - ja .Ltop - jmp .Lend + sub (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + sbb %ebp, %ebp + inc %rcx +.Lent: jns .Lend .align 16, 0x90 -.Ltop: mov (%rdx), %r8 - shrd $63, %r8, %rbp -.Lb00: mov 8(%rdx), %r9 - shrd $63, %r9, %r8 - mov 16(%rdx), %r10 - shrd $63, %r10, 
%r9 - mov 24(%rdx), %r11 - shrd $63, %r11, %r10 - lea 32(%rdx), %rdx - add %eax, %eax - sbb (%rsi), %rbp - sbb 8(%rsi), %r8 - sbb 16(%rsi), %r9 - sbb 24(%rsi), %r10 - lea 32(%rsi), %rsi - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, 24(%rdi) - mov %r11, %rbp - lea 32(%rdi), %rdi +.Ltop: add %eax, %eax + + mov (%rdx,%rcx,8), %r8 +.Lb00: adc %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 + mov 24(%rdx,%rcx,8), %r11 + adc %r11, %r11 + sbb %eax, %eax - sub $4, %rcx - jnz .Ltop + add %ebp, %ebp + + sbb (%rsi,%rcx,8), %r8 + nop + sbb 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + sbb 16(%rsi,%rcx,8), %r10 + sbb 24(%rsi,%rcx,8), %r11 + mov %r10, 16(%rdi,%rcx,8) + mov %r11, 24(%rdi,%rcx,8) + + sbb %ebp, %ebp + add $4, %rcx + js .Ltop + +.Lend: + + + sub %eax, %ebp + movslq %ebp, %rax -.Lend: shr $63, %rbp - add %eax, %eax - sbb $0, %rbp - mov %rbp, %rax pop %rbp ret .size __gmpn_rsblsh1_n,.-__gmpn_rsblsh1_n - diff --git a/ext/gmp/gen/x86_64-linux/mpn/rsblsh2_n.s b/ext/gmp/gen/x86_64-linux/mpn/rsblsh2_n.s index fe7d1d3930..e9f079a236 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/rsblsh2_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/rsblsh2_n.s @@ -47,10 +47,13 @@ - - - - + + + + + + + @@ -87,30 +90,11 @@ - .text - .align 16, 0x90 - .globl __gmpn_rsblsh2_nc - .type __gmpn_rsblsh2_nc,@function - -__gmpn_rsblsh2_nc: - - push %rbp - mov %r8, %rax - neg %rax - xor %ebp, %ebp - mov (%rdx), %r8 - shrd $62, %r8, %rbp - mov %ecx, %r9d - and $3, %r9d - je .Lb00 - cmp $2, %r9d - jc .Lb01 - je .Lb10 - jmp .Lb11 - .size __gmpn_rsblsh2_nc,.-__gmpn_rsblsh2_nc + + .text .align 16, 0x90 .globl __gmpn_rsblsh2_n .type __gmpn_rsblsh2_n,@function @@ -118,96 +102,102 @@ __gmpn_rsblsh2_nc: __gmpn_rsblsh2_n: - push %rbp - xor %ebp, %ebp + push %r12 + push %r13 + push %r14 + push %r15 + mov (%rdx), %r8 - shrd $62, %r8, %rbp + lea (,%r8,4), %r12 + shr $62, %r8 + mov %ecx, %eax - and $3, %eax + lea (%rdi,%rcx,8), 
%rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rdx + neg %rcx + and $3, %al je .Lb00 - cmp $2, %eax + cmp $2, %al jc .Lb01 je .Lb10 -.Lb11: mov 8(%rdx), %r9 - shrd $62, %r9, %r8 - mov 16(%rdx), %r10 - shrd $62, %r10, %r9 - add %eax, %eax - sbb (%rsi), %rbp - sbb 8(%rsi), %r8 - sbb 16(%rsi), %r9 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, %rbp - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea 24(%rdi), %rdi - sbb %eax, %eax - sub $3, %rcx - ja .Ltop +.Lb11: mov 8(%rdx,%rcx,8), %r10 + lea (%r8,%r10,4), %r14 + shr $62, %r10 + mov 16(%rdx,%rcx,8), %r11 + lea (%r10,%r11,4), %r15 + shr $62, %r11 + sub (%rsi,%rcx,8), %r12 + sbb 8(%rsi,%rcx,8), %r14 + sbb 16(%rsi,%rcx,8), %r15 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + mov %r14, 8(%rdi,%rcx,8) + mov %r15, 16(%rdi,%rcx,8) + add $3, %rcx + js .Ltop jmp .Lend -.Lb01: add %eax, %eax - sbb (%rsi), %rbp - mov %rbp, (%rdi) - mov %r8, %rbp - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - sbb %eax, %eax - sub $1, %rcx - ja .Ltop +.Lb01: mov %r8, %r11 + sub (%rsi,%rcx,8), %r12 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + add $1, %rcx + js .Ltop jmp .Lend -.Lb10: mov 8(%rdx), %r9 - shrd $62, %r9, %r8 - add %eax, %eax - sbb (%rsi), %rbp - sbb 8(%rsi), %r8 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, %rbp - lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea 16(%rdi), %rdi - sbb %eax, %eax - sub $2, %rcx - ja .Ltop +.Lb10: mov 8(%rdx,%rcx,8), %r11 + lea (%r8,%r11,4), %r15 + shr $62, %r11 + sub (%rsi,%rcx,8), %r12 + sbb 8(%rsi,%rcx,8), %r15 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + mov %r15, 8(%rdi,%rcx,8) + add $2, %rcx + js .Ltop jmp .Lend +.Lb00: mov 8(%rdx,%rcx,8), %r9 + mov 16(%rdx,%rcx,8), %r10 + jmp .Le00 + .align 16, 0x90 -.Ltop: mov (%rdx), %r8 - shrd $62, %r8, %rbp -.Lb00: mov 8(%rdx), %r9 - shrd $62, %r9, %r8 - mov 16(%rdx), %r10 - shrd $62, %r10, %r9 - mov 24(%rdx), %r11 - shrd $62, %r11, %r10 - lea 32(%rdx), %rdx - add %eax, %eax - sbb (%rsi), %rbp - sbb 8(%rsi), %r8 - sbb 
16(%rsi), %r9 - sbb 24(%rsi), %r10 - lea 32(%rsi), %rsi - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, 24(%rdi) - mov %r11, %rbp - lea 32(%rdi), %rdi - sbb %eax, %eax - sub $4, %rcx - jnz .Ltop - -.Lend: shr $62, %rbp - add %eax, %eax - sbb $0, %rbp - mov %rbp, %rax - pop %rbp +.Ltop: mov 16(%rdx,%rcx,8), %r10 + mov (%rdx,%rcx,8), %r8 + mov 8(%rdx,%rcx,8), %r9 + lea (%r11,%r8,4), %r12 + shr $62, %r8 +.Le00: lea (%r8,%r9,4), %r13 + shr $62, %r9 + mov 24(%rdx,%rcx,8), %r11 + lea (%r9,%r10,4), %r14 + shr $62, %r10 + lea (%r10,%r11,4), %r15 + shr $62, %r11 + add %eax, %eax + sbb (%rsi,%rcx,8), %r12 + sbb 8(%rsi,%rcx,8), %r13 + sbb 16(%rsi,%rcx,8), %r14 + sbb 24(%rsi,%rcx,8), %r15 + mov %r12, (%rdi,%rcx,8) + mov %r13, 8(%rdi,%rcx,8) + mov %r14, 16(%rdi,%rcx,8) + sbb %eax, %eax + mov %r15, 24(%rdi,%rcx,8) + add $4, %rcx + js .Ltop +.Lend: + + + add %r11d, %eax + movslq %eax, %rax + + pop %r15 + pop %r14 + pop %r13 + pop %r12 ret .size __gmpn_rsblsh2_n,.-__gmpn_rsblsh2_n diff --git a/ext/gmp/gen/x86_64-linux/mpn/rsblsh_n.s b/ext/gmp/gen/x86_64-linux/mpn/rsblsh_n.s index b64824b9f9..d439217a6c 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/rsblsh_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/rsblsh_n.s @@ -66,32 +66,7 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - + @@ -103,7 +78,7 @@ .text - .align 32, 0x90 + .align 16, 0x90 .globl __gmpn_rsblsh_n .type __gmpn_rsblsh_n,@function @@ -111,142 +86,143 @@ __gmpn_rsblsh_n: + push %r12 + push %rbp + push %rbx + + mov (%rdx), %rax + + mov $0, %ebp + sub %rcx, %rbp + + lea -16(%rsi,%rcx,8), %rsi + lea -16(%rdi,%rcx,8), %rdi + lea 16(%rdx,%rcx,8), %r12 + + mov %rcx, %r9 + + mov %r8, %rcx + mov $1, %r8d + shl %cl, %r8 + + mul %r8 + + and $3, %r9d + jz .Lb0 + cmp $2, %r9d + jc .Lb1 + jz .Lb2 + +.Lb3: mov %rax, %r11 + sub 16(%rsi,%rbp,8), %r11 + mov -8(%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov (%r12,%rbp,8), %rax + mov %rdx, %r9 + mul %r8 + or %rax, %r9 + add $3, %rbp + 
jnz .Llo3 + jmp .Lcj3 + +.Lb2: mov %rax, %rbx + mov -8(%r12,%rbp,8), %rax + mov %rdx, %r9 + mul %r8 + or %rax, %r9 + add $2, %rbp + jz .Lcj2 + mov %rdx, %r10 + mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + xor %ecx, %ecx + jmp .Llo2 + +.Lb1: mov %rax, %r9 + mov %rdx, %r10 + add $1, %rbp + jnz .Lgt1 + sub 8(%rsi,%rbp,8), %r9 + jmp .Lcj1 +.Lgt1: mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + sub 8(%rsi,%rbp,8), %r9 + sbb 16(%rsi,%rbp,8), %r10 + sbb 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + jmp .Llo1 + +.Lb0: mov %rax, %r10 + mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + sub 16(%rsi,%rbp,8), %r10 + sbb 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov 8(%r12,%rbp,8), %rax + add $4, %rbp + jz .Lend - mov (%rdx), %r10 - - mov %ecx, %eax - shr $3, %rcx - xor %r9d, %r9d - sub %r8, %r9 - and $7, %eax - - lea .Ltab(%rip), %r11 - - movslq (%r11,%rax,4), %rax - add %r11, %rax - jmp *%rax - - -.L0: lea 32(%rsi), %rsi - lea 32(%rdx), %rdx - lea 32(%rdi), %rdi - xor %r11d, %r11d - jmp .Le0 - -.L7: mov %r10, %r11 - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea 24(%rdi), %rdi - xor %r10d, %r10d - jmp .Le7 - -.L6: lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea 16(%rdi), %rdi - xor %r11d, %r11d - jmp .Le6 - -.L5: mov %r10, %r11 - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - xor %r10d, %r10d - jmp .Le5 - -.Lend: sbb 24(%rsi), %rax - mov %rax, -40(%rdi) - .byte 0xc4,194,179,0xf7,195 - sbb %rcx, %rax + .align 8, 0x90 +.Ltop: mov %rdx, %r9 + mul %r8 + or %rax, %r9 + mov %r10, -16(%rdi,%rbp,8) +.Llo3: mov %rdx, %r10 + mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + mov %r11, -8(%rdi,%rbp,8) +.Llo2: mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + add %ecx, %ecx + sbb (%rsi,%rbp,8), %rbx + sbb 8(%rsi,%rbp,8), %r9 + sbb 16(%rsi,%rbp,8), %r10 + sbb 
24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rbx, (%rdi,%rbp,8) +.Llo1: mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov %r9, 8(%rdi,%rbp,8) +.Llo0: mov 8(%r12,%rbp,8), %rax + add $4, %rbp + jnz .Ltop + +.Lend: mov %rdx, %r9 + mul %r8 + or %rax, %r9 + mov %r10, -16(%rdi,%rbp,8) +.Lcj3: mov %r11, -8(%rdi,%rbp,8) +.Lcj2: add %ecx, %ecx + sbb (%rsi,%rbp,8), %rbx + sbb 8(%rsi,%rbp,8), %r9 + mov %rbx, (%rdi,%rbp,8) +.Lcj1: mov %r9, 8(%rdi,%rbp,8) + mov %rdx, %rax + sbb $0, %rax + pop %rbx + pop %rbp + pop %r12 ret - - .align 32, 0x90 -.Ltop: jrcxz .Lend - mov -32(%rdx), %r10 - sbb 24(%rsi), %rax - lea 64(%rsi), %rsi - .byte 0xc4,66,179,0xf7,219 - mov %rax, -40(%rdi) -.Le0: dec %rcx - .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov -24(%rdx), %r11 - sbb -32(%rsi), %rax - .byte 0xc4,66,179,0xf7,210 - mov %rax, -32(%rdi) -.Le7: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - mov -16(%rdx), %r10 - sbb -24(%rsi), %rax - .byte 0xc4,66,179,0xf7,219 - mov %rax, -24(%rdi) -.Le6: .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov -8(%rdx), %r11 - sbb -16(%rsi), %rax - .byte 0xc4,66,179,0xf7,210 - mov %rax, -16(%rdi) -.Le5: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - mov (%rdx), %r10 - sbb -8(%rsi), %rax - .byte 0xc4,66,179,0xf7,219 - mov %rax, -8(%rdi) -.Le4: .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov 8(%rdx), %r11 - sbb (%rsi), %rax - .byte 0xc4,66,179,0xf7,210 - mov %rax, (%rdi) -.Le3: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - mov 16(%rdx), %r10 - sbb 8(%rsi), %rax - .byte 0xc4,66,179,0xf7,219 - mov %rax, 8(%rdi) -.Le2: .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov 24(%rdx), %r11 - sbb 16(%rsi), %rax - lea 64(%rdx), %rdx - .byte 0xc4,66,179,0xf7,210 - mov %rax, 16(%rdi) - lea 64(%rdi), %rdi -.Le1: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - jmp .Ltop - -.L4: xor %r11d, %r11d - jmp .Le4 - -.L3: mov %r10, %r11 - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi - 
xor %r10d, %r10d - jmp .Le3 - -.L2: lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi - xor %r11d, %r11d - jmp .Le2 - -.L1: mov %r10, %r11 - lea -24(%rsi), %rsi - lea 40(%rdx), %rdx - lea 40(%rdi), %rdi - xor %r10d, %r10d - jmp .Le1 .size __gmpn_rsblsh_n,.-__gmpn_rsblsh_n - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .L0-.Ltab - .long .L1-.Ltab - .long .L2-.Ltab - .long .L3-.Ltab - .long .L4-.Ltab - .long .L5-.Ltab - .long .L6-.Ltab - .long .L7-.Ltab - diff --git a/ext/gmp/gen/x86_64-linux/mpn/rsh1add_n.s b/ext/gmp/gen/x86_64-linux/mpn/rsh1add_n.s index c385f661fc..8554f6f047 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/rsh1add_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/rsh1add_n.s @@ -56,6 +56,8 @@ + + @@ -72,7 +74,6 @@ .text - .align 16, 0x90 .globl __gmpn_rsh1add_nc .type __gmpn_rsh1add_nc,@function @@ -82,12 +83,11 @@ __gmpn_rsh1add_nc: push %rbx - push %rbp + xor %eax, %eax neg %r8 - mov (%rsi), %rbp - adc (%rdx), %rbp - + mov (%rsi), %rbx + adc (%rdx), %rbx jmp .Lent .size __gmpn_rsh1add_nc,.-__gmpn_rsh1add_nc @@ -99,14 +99,13 @@ __gmpn_rsh1add_n: push %rbx - push %rbp - mov (%rsi), %rbp - add (%rdx), %rbp + xor %eax, %eax + mov (%rsi), %rbx + add (%rdx), %rbx .Lent: - sbb %ebx, %ebx - mov %ebp, %eax - and $1, %eax + rcr %rbx + adc %eax, %eax mov %ecx, %r11d and $3, %r11d @@ -116,21 +115,20 @@ __gmpn_rsh1add_n: .Ln1: cmp $2, %r11d jne .Ln2 - add %ebx, %ebx + add %rbx, %rbx mov 8(%rsi), %r10 adc 8(%rdx), %r10 lea 8(%rsi), %rsi lea 8(%rdx), %rdx lea 8(%rdi), %rdi - sbb %ebx, %ebx - - shrd $1, %r10, %rbp - mov %rbp, -8(%rdi) + rcr %r10 + rcr %rbx + mov %rbx, -8(%rdi) jmp .Lcj1 .Ln2: cmp $3, %r11d jne .Ln3 - add %ebx, %ebx + add %rbx, %rbx mov 8(%rsi), %r9 mov 16(%rsi), %r10 adc 8(%rdx), %r9 @@ -138,14 +136,14 @@ __gmpn_rsh1add_n: lea 16(%rsi), %rsi lea 16(%rdx), %rdx lea 16(%rdi), %rdi - sbb %ebx, %ebx - - shrd $1, %r9, %rbp - mov %rbp, -16(%rdi) + rcr %r10 + rcr %r9 + rcr %rbx + mov %rbx, -16(%rdi) jmp .Lcj2 -.Ln3: dec %rcx - 
add %ebx, %ebx +.Ln3: dec %rcx + add %rbx, %rbx mov 8(%rsi), %r8 mov 16(%rsi), %r9 adc 8(%rdx), %r8 @@ -155,21 +153,20 @@ __gmpn_rsh1add_n: lea 24(%rsi), %rsi lea 24(%rdx), %rdx lea 24(%rdi), %rdi - sbb %ebx, %ebx - - shrd $1, %r8, %rbp - mov %rbp, -24(%rdi) - shrd $1, %r9, %r8 + rcr %r10 + rcr %r9 + rcr %r8 + rcr %rbx + mov %rbx, -24(%rdi) mov %r8, -16(%rdi) -.Lcj2: shrd $1, %r10, %r9 - mov %r9, -8(%rdi) -.Lcj1: mov %r10, %rbp +.Lcj2: mov %r9, -8(%rdi) +.Lcj1: mov %r10, %rbx .Ldo: - shr $2, %rcx + shr $2, %rcx je .Lend .align 16, 0x90 -.Ltop: add %ebx, %ebx +.Ltop: add %rbx, %rbx mov 8(%rsi), %r8 mov 16(%rsi), %r9 @@ -183,25 +180,23 @@ __gmpn_rsh1add_n: lea 32(%rsi), %rsi lea 32(%rdx), %rdx - sbb %ebx, %ebx + rcr %r11 + rcr %r10 + rcr %r9 + rcr %r8 - shrd $1, %r8, %rbp - mov %rbp, (%rdi) - shrd $1, %r9, %r8 + rcr %rbx + mov %rbx, (%rdi) mov %r8, 8(%rdi) - shrd $1, %r10, %r9 mov %r9, 16(%rdi) - shrd $1, %r11, %r10 mov %r10, 24(%rdi) + mov %r11, %rbx - dec %rcx - mov %r11, %rbp lea 32(%rdi), %rdi + dec %rcx jne .Ltop -.Lend: shrd $1, %rbx, %rbp - mov %rbp, (%rdi) - pop %rbp +.Lend: mov %rbx, (%rdi) pop %rbx ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/rsh1sub_n.s b/ext/gmp/gen/x86_64-linux/mpn/rsh1sub_n.s index 0d7ab328a6..ff06ece4bc 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/rsh1sub_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/rsh1sub_n.s @@ -57,6 +57,8 @@ + + @@ -72,7 +74,6 @@ .text - .align 16, 0x90 .globl __gmpn_rsh1sub_nc .type __gmpn_rsh1sub_nc,@function @@ -82,12 +83,11 @@ __gmpn_rsh1sub_nc: push %rbx - push %rbp + xor %eax, %eax neg %r8 - mov (%rsi), %rbp - sbb (%rdx), %rbp - + mov (%rsi), %rbx + sbb (%rdx), %rbx jmp .Lent .size __gmpn_rsh1sub_nc,.-__gmpn_rsh1sub_nc @@ -99,14 +99,13 @@ __gmpn_rsh1sub_n: push %rbx - push %rbp - mov (%rsi), %rbp - sub (%rdx), %rbp + xor %eax, %eax + mov (%rsi), %rbx + sub (%rdx), %rbx .Lent: - sbb %ebx, %ebx - mov %ebp, %eax - and $1, %eax + rcr %rbx + adc %eax, %eax mov %ecx, %r11d and $3, %r11d @@ -116,21 +115,20 @@ 
__gmpn_rsh1sub_n: .Ln1: cmp $2, %r11d jne .Ln2 - add %ebx, %ebx + add %rbx, %rbx mov 8(%rsi), %r10 sbb 8(%rdx), %r10 lea 8(%rsi), %rsi lea 8(%rdx), %rdx lea 8(%rdi), %rdi - sbb %ebx, %ebx - - shrd $1, %r10, %rbp - mov %rbp, -8(%rdi) + rcr %r10 + rcr %rbx + mov %rbx, -8(%rdi) jmp .Lcj1 .Ln2: cmp $3, %r11d jne .Ln3 - add %ebx, %ebx + add %rbx, %rbx mov 8(%rsi), %r9 mov 16(%rsi), %r10 sbb 8(%rdx), %r9 @@ -138,14 +136,14 @@ __gmpn_rsh1sub_n: lea 16(%rsi), %rsi lea 16(%rdx), %rdx lea 16(%rdi), %rdi - sbb %ebx, %ebx - - shrd $1, %r9, %rbp - mov %rbp, -16(%rdi) + rcr %r10 + rcr %r9 + rcr %rbx + mov %rbx, -16(%rdi) jmp .Lcj2 -.Ln3: dec %rcx - add %ebx, %ebx +.Ln3: dec %rcx + add %rbx, %rbx mov 8(%rsi), %r8 mov 16(%rsi), %r9 sbb 8(%rdx), %r8 @@ -155,21 +153,20 @@ __gmpn_rsh1sub_n: lea 24(%rsi), %rsi lea 24(%rdx), %rdx lea 24(%rdi), %rdi - sbb %ebx, %ebx - - shrd $1, %r8, %rbp - mov %rbp, -24(%rdi) - shrd $1, %r9, %r8 + rcr %r10 + rcr %r9 + rcr %r8 + rcr %rbx + mov %rbx, -24(%rdi) mov %r8, -16(%rdi) -.Lcj2: shrd $1, %r10, %r9 - mov %r9, -8(%rdi) -.Lcj1: mov %r10, %rbp +.Lcj2: mov %r9, -8(%rdi) +.Lcj1: mov %r10, %rbx .Ldo: - shr $2, %rcx + shr $2, %rcx je .Lend .align 16, 0x90 -.Ltop: add %ebx, %ebx +.Ltop: add %rbx, %rbx mov 8(%rsi), %r8 mov 16(%rsi), %r9 @@ -183,25 +180,23 @@ __gmpn_rsh1sub_n: lea 32(%rsi), %rsi lea 32(%rdx), %rdx - sbb %ebx, %ebx + rcr %r11 + rcr %r10 + rcr %r9 + rcr %r8 - shrd $1, %r8, %rbp - mov %rbp, (%rdi) - shrd $1, %r9, %r8 + rcr %rbx + mov %rbx, (%rdi) mov %r8, 8(%rdi) - shrd $1, %r10, %r9 mov %r9, 16(%rdi) - shrd $1, %r11, %r10 mov %r10, 24(%rdi) + mov %r11, %rbx - dec %rcx - mov %r11, %rbp lea 32(%rdi), %rdi + dec %rcx jne .Ltop -.Lend: shrd $1, %rbx, %rbp - mov %rbp, (%rdi) - pop %rbp +.Lend: mov %rbx, (%rdi) pop %rbx ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/rshift.s b/ext/gmp/gen/x86_64-linux/mpn/rshift.s index 386eccd1ac..8ddd7b5557 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/rshift.s +++ b/ext/gmp/gen/x86_64-linux/mpn/rshift.s @@ -41,32 
+41,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -89,142 +63,129 @@ .text - .align 64, 0x90 + .align 32, 0x90 .globl __gmpn_rshift .type __gmpn_rshift,@function __gmpn_rshift: - movd %ecx, %xmm4 - mov $64, %eax - sub %ecx, %eax - movd %eax, %xmm5 - - neg %ecx + neg %ecx mov (%rsi), %rax - shl %cl, %rax - - cmp $3, %rdx - jle .Lbc + shl %cl, %rax + neg %ecx - test $8, %dil - jz .Lrp_aligned - - - movq (%rsi), %xmm0 - movq 8(%rsi), %xmm1 - psrlq %xmm4, %xmm0 - psllq %xmm5, %xmm1 - por %xmm1, %xmm0 - movq %xmm0, (%rdi) - lea 8(%rsi), %rsi - lea 8(%rdi), %rdi - dec %rdx - -.Lrp_aligned: lea 1(%rdx), %r8d - lea (%rsi,%rdx,8), %rsi - lea (%rdi,%rdx,8), %rdi - neg %rdx - and $6, %r8d - jz .Lbu0 - cmp $4, %r8d - jz .Lbu4 - jc .Lbu2 -.Lbu6: add $4, %rdx - jmp .Li56 -.Lbu0: add $6, %rdx - jmp .Li70 -.Lbu4: add $2, %rdx - jmp .Li34 -.Lbu2: add $8, %rdx - jge .Lend + lea -8(%rsi,%rdx,8), %rsi + lea -8(%rdi,%rdx,8), %rdi + neg %rdx + and $3, %r8d + je .Lrlx + + dec %r8d + jne .L1 + + mov 8(%rsi,%rdx,8), %r10 + shr %cl, %r10 + neg %ecx + mov 16(%rsi,%rdx,8), %r8 + shl %cl, %r8 + or %r8, %r10 + mov %r10, 8(%rdi,%rdx,8) + inc %rdx + jmp .Lrll + +.L1: dec %r8d + je .L1x + + mov 8(%rsi,%rdx,8), %r10 + shr %cl, %r10 + neg %ecx + mov 16(%rsi,%rdx,8), %r8 + shl %cl, %r8 + or %r8, %r10 + mov %r10, 8(%rdi,%rdx,8) + inc %rdx + neg %ecx +.L1x: + cmp $-1, %rdx + je .Last + mov 8(%rsi,%rdx,8), %r10 + shr %cl, %r10 + mov 16(%rsi,%rdx,8), %r11 + shr %cl, %r11 + neg %ecx + mov 16(%rsi,%rdx,8), %r8 + mov 24(%rsi,%rdx,8), %r9 + shl %cl, %r8 + or %r8, %r10 + shl %cl, %r9 + or %r9, %r11 + mov %r10, 8(%rdi,%rdx,8) + mov %r11, 16(%rdi,%rdx,8) + add $2, %rdx + +.Lrll: neg %ecx +.Lrlx: mov 8(%rsi,%rdx,8), %r10 + shr %cl, %r10 + mov 16(%rsi,%rdx,8), %r11 + shr %cl, %r11 + + add $4, %rdx + jb .Lend .align 16, 0x90 -.Ltop: movdqu -64(%rsi,%rdx,8), %xmm1 - movdqu -56(%rsi,%rdx,8), %xmm0 - psllq %xmm5, %xmm0 - psrlq %xmm4, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, -64(%rdi,%rdx,8) -.Li70: - 
movdqu -48(%rsi,%rdx,8), %xmm1 - movdqu -40(%rsi,%rdx,8), %xmm0 - psllq %xmm5, %xmm0 - psrlq %xmm4, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, -48(%rdi,%rdx,8) -.Li56: - movdqu -32(%rsi,%rdx,8), %xmm1 - movdqu -24(%rsi,%rdx,8), %xmm0 - psllq %xmm5, %xmm0 - psrlq %xmm4, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, -32(%rdi,%rdx,8) -.Li34: - movdqu -16(%rsi,%rdx,8), %xmm1 - movdqu -8(%rsi,%rdx,8), %xmm0 - psllq %xmm5, %xmm0 - psrlq %xmm4, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, -16(%rdi,%rdx,8) - add $8, %rdx - jl .Ltop - -.Lend: test $1, %dl - jnz .Le1 - - movdqu -16(%rsi), %xmm1 - movq -8(%rsi), %xmm0 - psrlq %xmm4, %xmm1 - psllq %xmm5, %xmm0 - por %xmm1, %xmm0 - movdqa %xmm0, -16(%rdi) +.Ltop: - ret - -.Le1: movq -8(%rsi), %xmm0 - psrlq %xmm4, %xmm0 - movq %xmm0, -8(%rdi) + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + mov -8(%rsi,%rdx,8), %r9 + shl %cl, %r8 + or %r8, %r10 + shl %cl, %r9 + or %r9, %r11 + mov %r10, -24(%rdi,%rdx,8) + mov %r11, -16(%rdi,%rdx,8) - ret + mov (%rsi,%rdx,8), %r8 + mov 8(%rsi,%rdx,8), %r9 + shl %cl, %r8 + shl %cl, %r9 - - .align 16, 0x90 -.Lbc: dec %edx - jnz 1f - movq (%rsi), %xmm0 - psrlq %xmm4, %xmm0 - movq %xmm0, (%rdi) - ret - -1: movq (%rsi), %xmm1 - movq 8(%rsi), %xmm0 - psrlq %xmm4, %xmm1 - psllq %xmm5, %xmm0 - por %xmm1, %xmm0 - movq %xmm0, (%rdi) - dec %edx - jnz 1f - movq 8(%rsi), %xmm0 - psrlq %xmm4, %xmm0 - movq %xmm0, 8(%rdi) + neg %ecx + mov -8(%rsi,%rdx,8), %r10 + mov 0(%rsi,%rdx,8), %r11 + shr %cl, %r10 + or %r10, %r8 + shr %cl, %r11 + or %r11, %r9 + mov %r8, -8(%rdi,%rdx,8) + mov %r9, 0(%rdi,%rdx,8) - ret - -1: movq 8(%rsi), %xmm1 - movq 16(%rsi), %xmm0 - psrlq %xmm4, %xmm1 - psllq %xmm5, %xmm0 - por %xmm1, %xmm0 - movq %xmm0, 8(%rdi) - movq 16(%rsi), %xmm0 - psrlq %xmm4, %xmm0 - movq %xmm0, 16(%rdi) + mov 8(%rsi,%rdx,8), %r10 + mov 16(%rsi,%rdx,8), %r11 + shr %cl, %r10 + shr %cl, %r11 + + add $4, %rdx + jae .Ltop +.Lend: + neg %ecx + mov -8(%rsi), %r8 + shl %cl, %r8 + or %r8, %r10 + mov (%rsi), %r9 + shl %cl, %r9 + or %r9, %r11 + 
mov %r10, -16(%rdi) + mov %r11, -8(%rdi) + + neg %ecx +.Last: mov (%rsi), %r10 + shr %cl, %r10 + mov %r10, (%rdi) ret .size __gmpn_rshift,.-__gmpn_rshift - diff --git a/ext/gmp/gen/x86_64-linux/mpn/sec_tabselect.s b/ext/gmp/gen/x86_64-linux/mpn/sec_tabselect.s index 6e67f45c31..7a50a70410 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/sec_tabselect.s +++ b/ext/gmp/gen/x86_64-linux/mpn/sec_tabselect.s @@ -62,21 +62,6 @@ - - - - - - - - - - - - - - - @@ -103,131 +88,103 @@ __gmpn_sec_tabselect: - - - - - - - movd %r8, %xmm8 - pshufd $0, %xmm8, %xmm8 - mov $1, %eax - movd %rax, %xmm9 - pshufd $0, %xmm9, %xmm9 + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 mov %rdx, %r9 - add $-8, %r9 + add $-4, %r9 js .Louter_end .Louter_top: - mov %rcx, %r10 - mov %rsi, %r11 - pxor %xmm1, %xmm1 - pxor %xmm4, %xmm4 - pxor %xmm5, %xmm5 - pxor %xmm6, %xmm6 - pxor %xmm7, %xmm7 + mov %rcx, %rbp + push %rsi + xor %r12d, %r12d + xor %r13d, %r13d + xor %r14d, %r14d + xor %r15d, %r15d + mov %r8, %rbx + .align 16, 0x90 -.Ltop: movdqa %xmm8, %xmm0 - pcmpeqd %xmm1, %xmm0 - paddd %xmm9, %xmm1 - movdqu 0(%rsi), %xmm2 - movdqu 16(%rsi), %xmm3 - pand %xmm0, %xmm2 - pand %xmm0, %xmm3 - por %xmm2, %xmm4 - por %xmm3, %xmm5 - movdqu 32(%rsi), %xmm2 - movdqu 48(%rsi), %xmm3 - pand %xmm0, %xmm2 - pand %xmm0, %xmm3 - por %xmm2, %xmm6 - por %xmm3, %xmm7 +.Ltop: sub $1, %rbx + sbb %rax, %rax + mov 0(%rsi), %r10 + mov 8(%rsi), %r11 + and %rax, %r10 + and %rax, %r11 + or %r10, %r12 + or %r11, %r13 + mov 16(%rsi), %r10 + mov 24(%rsi), %r11 + and %rax, %r10 + and %rax, %r11 + or %r10, %r14 + or %r11, %r15 lea (%rsi,%rdx,8), %rsi - add $-1, %r10 + add $-1, %rbp jne .Ltop - movdqu %xmm4, 0(%rdi) - movdqu %xmm5, 16(%rdi) - movdqu %xmm6, 32(%rdi) - movdqu %xmm7, 48(%rdi) - - lea 64(%r11), %rsi - lea 64(%rdi), %rdi - add $-8, %r9 + mov %r12, 0(%rdi) + mov %r13, 8(%rdi) + mov %r14, 16(%rdi) + mov %r15, 24(%rdi) + pop %rsi + lea 32(%rsi), %rsi + lea 32(%rdi), %rdi + add $-4, %r9 jns .Louter_top 
.Louter_end: - test $4, %dl - je .Lb0xx -.Lb1xx:mov %rcx, %r10 - mov %rsi, %r11 - pxor %xmm1, %xmm1 - pxor %xmm4, %xmm4 - pxor %xmm5, %xmm5 + test $2, %dl + jz .Lb0x +.Lb1x: mov %rcx, %rbp + push %rsi + xor %r12d, %r12d + xor %r13d, %r13d + mov %r8, %rbx .align 16, 0x90 -.Ltp4: movdqa %xmm8, %xmm0 - pcmpeqd %xmm1, %xmm0 - paddd %xmm9, %xmm1 - movdqu 0(%rsi), %xmm2 - movdqu 16(%rsi), %xmm3 - pand %xmm0, %xmm2 - pand %xmm0, %xmm3 - por %xmm2, %xmm4 - por %xmm3, %xmm5 +.Ltp2: sub $1, %rbx + sbb %rax, %rax + mov 0(%rsi), %r10 + mov 8(%rsi), %r11 + and %rax, %r10 + and %rax, %r11 + or %r10, %r12 + or %r11, %r13 lea (%rsi,%rdx,8), %rsi - add $-1, %r10 - jne .Ltp4 - movdqu %xmm4, 0(%rdi) - movdqu %xmm5, 16(%rdi) - lea 32(%r11), %rsi - lea 32(%rdi), %rdi - -.Lb0xx:test $2, %dl - je .Lb00x -.Lb01x:mov %rcx, %r10 - mov %rsi, %r11 - pxor %xmm1, %xmm1 - pxor %xmm4, %xmm4 - .align 16, 0x90 -.Ltp2: movdqa %xmm8, %xmm0 - pcmpeqd %xmm1, %xmm0 - paddd %xmm9, %xmm1 - movdqu 0(%rsi), %xmm2 - pand %xmm0, %xmm2 - por %xmm2, %xmm4 - lea (%rsi,%rdx,8), %rsi - add $-1, %r10 + add $-1, %rbp jne .Ltp2 - movdqu %xmm4, 0(%rdi) - lea 16(%r11), %rsi + mov %r12, 0(%rdi) + mov %r13, 8(%rdi) + pop %rsi + lea 16(%rsi), %rsi lea 16(%rdi), %rdi -.Lb00x:test $1, %dl - je .Lb000 -.Lb001:mov %rcx, %r10 - mov %rsi, %r11 - pxor %xmm1, %xmm1 - pxor %xmm4, %xmm4 +.Lb0x: test $1, %dl + jz .Lb00 +.Lb01: mov %rcx, %rbp + xor %r12d, %r12d + mov %r8, %rbx .align 16, 0x90 -.Ltp1: movdqa %xmm8, %xmm0 - pcmpeqd %xmm1, %xmm0 - paddd %xmm9, %xmm1 - movq 0(%rsi), %xmm2 - pand %xmm0, %xmm2 - por %xmm2, %xmm4 +.Ltp1: sub $1, %rbx + sbb %rax, %rax + mov 0(%rsi), %r10 + and %rax, %r10 + or %r10, %r12 lea (%rsi,%rdx,8), %rsi - add $-1, %r10 + add $-1, %rbp jne .Ltp1 - movq %xmm4, 0(%rdi) - -.Lb000: - - - - - + mov %r12, 0(%rdi) + +.Lb00: pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx ret .size __gmpn_sec_tabselect,.-__gmpn_sec_tabselect - diff --git a/ext/gmp/gen/x86_64-linux/mpn/sqr_basecase.s 
b/ext/gmp/gen/x86_64-linux/mpn/sqr_basecase.s index 26efdaa53a..eb24851327 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/sqr_basecase.s +++ b/ext/gmp/gen/x86_64-linux/mpn/sqr_basecase.s @@ -89,6 +89,11 @@ + + + + + @@ -103,746 +108,711 @@ __gmpn_sqr_basecase: + mov %edx, %ecx + mov %edx, %r11d + + add $-40, %rsp + + and $3, %ecx + cmp $4, %edx + lea 4(%rcx), %r8 - cmp $2, %rdx - jae .Lgt1 + mov %rbx, 32(%rsp) + mov %rbp, 24(%rsp) + mov %r12, 16(%rsp) + mov %r13, 8(%rsp) + mov %r14, (%rsp) + + cmovg %r8, %rcx + + lea .Ltab(%rip), %rax + movslq (%rax,%rcx,4), %r10 + add %r10, %rax + jmp *%rax + + .section .data.rel.ro.local,"a",@progbits + .align 8, 0x90 +.Ltab: .long .L4-.Ltab + .long .L1-.Ltab + .long .L2-.Ltab + .long .L3-.Ltab + .long .L0m4-.Ltab + .long .L1m4-.Ltab + .long .L2m4-.Ltab + .long .L3m4-.Ltab + .text - mov (%rsi), %rdx - .byte 0xc4,226,251,0xf6,210 +.L1: mov (%rsi), %rax + mul %rax + add $40, %rsp mov %rax, (%rdi) mov %rdx, 8(%rdi) ret -.Lgt1: jne .Lgt2 - - mov (%rsi), %rdx - mov 8(%rsi), %rcx - .byte 0xc4,98,179,0xf6,209 - .byte 0xc4,98,251,0xf6,194 - mov %rcx, %rdx - .byte 0xc4,226,163,0xf6,210 - add %r9, %r9 - adc %r10, %r10 - adc $0, %rdx - add %r9, %r8 - adc %r11, %r10 - adc $0, %rdx +.L2: mov (%rsi), %rax + mov %rax, %r8 + mul %rax + mov 8(%rsi), %r11 mov %rax, (%rdi) - mov %r8, 8(%rdi) + mov %r11, %rax + mov %rdx, %r9 + mul %rax + add $40, %rsp + mov %rax, %r10 + mov %r11, %rax + mov %rdx, %r11 + mul %r8 + xor %r8, %r8 + add %rax, %r9 + adc %rdx, %r10 + adc %r8, %r11 + add %rax, %r9 + mov %r9, 8(%rdi) + adc %rdx, %r10 mov %r10, 16(%rdi) + adc %r8, %r11 + mov %r11, 24(%rdi) + + ret + +.L3: mov (%rsi), %rax + mov %rax, %r10 + mul %rax + mov 8(%rsi), %r11 + mov %rax, (%rdi) + mov %r11, %rax + mov %rdx, 8(%rdi) + mul %rax + mov 16(%rsi), %rcx + mov %rax, 16(%rdi) + mov %rcx, %rax mov %rdx, 24(%rdi) + mul %rax + mov %rax, 32(%rdi) + mov %rdx, 40(%rdi) + + mov %r11, %rax + mul %r10 + mov %rax, %r8 + mov %rcx, %rax + mov %rdx, %r9 + mul %r10 + xor %r10, 
%r10 + add %rax, %r9 + mov %r11, %rax + mov %r10, %r11 + adc %rdx, %r10 + + mul %rcx + add $40, %rsp + add %rax, %r10 + adc %r11, %rdx + add %r8, %r8 + adc %r9, %r9 + adc %r10, %r10 + adc %rdx, %rdx + adc %r11, %r11 + add %r8, 8(%rdi) + adc %r9, 16(%rdi) + adc %r10, 24(%rdi) + adc %rdx, 32(%rdi) + adc %r11, 40(%rdi) ret -.Lgt2: cmp $4, %rdx - jae .Lgt3 - - push %rbx - mov (%rsi), %rdx - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xc4,98,187,0xf6,78,16 - add %r11, %r8 - mov 8(%rsi), %rdx - .byte 0xc4,98,251,0xf6,94,16 - adc %rax, %r9 - adc $0, %r11 - test %ebx, %ebx - mov (%rsi), %rdx - .byte 0xc4,226,227,0xf6,202 - mov %rbx, (%rdi) - mov 8(%rsi), %rdx - .byte 0xc4,226,251,0xf6,218 - mov 16(%rsi), %rdx - .byte 0xc4,226,203,0xf6,210 - .byte 0x66,77,0x0f,0x38,0xf6,210 - .byte 0x66,77,0x0f,0x38,0xf6,192 - .byte 0x66,77,0x0f,0x38,0xf6,201 - .byte 0x66,77,0x0f,0x38,0xf6,219 - .byte 0xf3,73,0x0f,0x38,0xf6,202 - .byte 0xf3,73,0x0f,0x38,0xf6,192 - .byte 0xf3,73,0x0f,0x38,0xf6,217 - .byte 0xf3,73,0x0f,0x38,0xf6,243 - mov $0, %r8d - .byte 0xf3,73,0x0f,0x38,0xf6,208 - .byte 0x66,73,0x0f,0x38,0xf6,208 - mov %rcx, 8(%rdi) +.L4: mov (%rsi), %rax + mov %rax, %r11 + mul %rax + mov 8(%rsi), %rbx + mov %rax, (%rdi) + mov %rbx, %rax + mov %rdx, 8(%rdi) + mul %rax mov %rax, 16(%rdi) - mov %rbx, 24(%rdi) - mov %rsi, 32(%rdi) + mov %rdx, 24(%rdi) + mov 16(%rsi), %rax + mul %rax + mov %rax, 32(%rdi) mov %rdx, 40(%rdi) + mov 24(%rsi), %rax + mul %rax + mov %rax, 48(%rdi) + mov %rbx, %rax + mov %rdx, 56(%rdi) + + mul %r11 + add $32, %rsp + mov %rax, %r8 + mov %rdx, %r9 + mov 16(%rsi), %rax + mul %r11 + xor %r10, %r10 + add %rax, %r9 + adc %rdx, %r10 + mov 24(%rsi), %rax + mul %r11 + xor %r11, %r11 + add %rax, %r10 + adc %rdx, %r11 + mov 16(%rsi), %rax + mul %rbx + xor %rcx, %rcx + add %rax, %r10 + adc %rdx, %r11 + adc $0, %rcx + mov 24(%rsi), %rax + mul %rbx pop %rbx + add %rax, %r11 + adc %rdx, %rcx + mov 16(%rsi), %rdx + mov 24(%rsi), %rax + mul %rdx + add %rax, %rcx + adc $0, %rdx + + add %r8, 
%r8 + adc %r9, %r9 + adc %r10, %r10 + adc %r11, %r11 + adc %rcx, %rcx + mov $0, %eax + adc %rdx, %rdx + + adc %rax, %rax + add %r8, 8(%rdi) + adc %r9, 16(%rdi) + adc %r10, 24(%rdi) + adc %r11, 32(%rdi) + adc %rcx, 40(%rdi) + adc %rdx, 48(%rdi) + adc %rax, 56(%rdi) ret -.Lgt3: push %rbx - - lea -3(%rdx), %ebx - lea 5(%rdx), %ecx - mov %edx, %eax - and $-8, %ebx - shr $3, %ecx - neg %rbx - and $7, %eax - mov (%rsi), %rdx +.L0m4: + lea -16(%rdi,%r11,8), %r12 + mov (%rsi), %r13 + mov 8(%rsi), %rax + lea (%rsi,%r11,8), %rsi - lea .Lmtab(%rip), %r10 - movslq (%r10,%rax,4), %r8 - lea (%r8, %r10), %r10 - jmp *%r10 + lea -4(%r11), %r8 + xor %r9d, %r9d + sub %r11, %r9 -.Lmf0: .byte 0xc4,98,187,0xf6,202 - add %rdx, %rdx - .byte 0xc4,98,171,0xf6,94,8 - lea 64(%rsi), %rsi - add %r9, %r10 - jmp .Lmb0 - -.Lmf3: .byte 0xc4,98,171,0xf6,218 - add %rdx, %rdx - mov %r10, (%rdi) - .byte 0xc4,98,187,0xf6,78,8 - lea 24(%rsi), %rsi - lea 24(%rdi), %rdi - add %r11, %r8 - jmp .Lmb3 - -.Lmf4: .byte 0xc4,98,187,0xf6,202 - add %rdx, %rdx - .byte 0xc4,98,171,0xf6,94,8 - mov %r8, (%rdi) - lea 32(%rsi), %rsi - lea 32(%rdi), %rdi - add %r9, %r10 - jmp .Lmb4 - -.Lmf5: .byte 0xc4,98,171,0xf6,218 - add %rdx, %rdx - .byte 0xc4,98,187,0xf6,78,8 - mov %r10, (%rdi) - lea 40(%rsi), %rsi - lea 40(%rdi), %rdi - add %r11, %r8 - jmp .Lmb5 - -.Lmf6: .byte 0xc4,98,187,0xf6,202 - add %rdx, %rdx - .byte 0xc4,98,171,0xf6,94,8 - mov %r8, (%rdi) - lea 48(%rsi), %rsi - lea 48(%rdi), %rdi - add %r9, %r10 - jmp .Lmb6 - -.Lmf7: .byte 0xc4,98,171,0xf6,218 - add %rdx, %rdx - .byte 0xc4,98,187,0xf6,78,8 - mov %r10, (%rdi) - lea 56(%rsi), %rsi - lea 56(%rdi), %rdi - add %r11, %r8 - jmp .Lmb7 - -.Lmf1: .byte 0xc4,98,171,0xf6,218 - add %rdx, %rdx - .byte 0xc4,98,187,0xf6,78,8 - mov %r10, (%rdi) - lea 8(%rsi), %rsi - lea 8(%rdi), %rdi - add %r11, %r8 - jmp .Lmb1 - -.Lmf2: .byte 0xc4,98,187,0xf6,202 - add %rdx, %rdx - .byte 0xc4,98,171,0xf6,94,8 - mov %r8, (%rdi) - lea 16(%rsi), %rsi - lea 16(%rdi), %rdi - dec %ecx - add %r9, 
%r10 - .byte 0xc4,98,187,0xf6,14 - - .align 16, 0x90 -.Ltop: mov %r10, -8(%rdi) - adc %r11, %r8 -.Lmb1: .byte 0xc4,98,171,0xf6,94,8 - adc %r9, %r10 - lea 64(%rsi), %rsi -.Lmb0: mov %r8, (%rdi) - mov %r10, 8(%rdi) - .byte 0xc4,98,187,0xf6,78,208 - lea 64(%rdi), %rdi - adc %r11, %r8 -.Lmb7: .byte 0xc4,98,171,0xf6,94,216 - mov %r8, -48(%rdi) - adc %r9, %r10 -.Lmb6: mov %r10, -40(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - adc %r11, %r8 -.Lmb5: .byte 0xc4,98,171,0xf6,94,232 - mov %r8, -32(%rdi) - adc %r9, %r10 -.Lmb4: .byte 0xc4,98,187,0xf6,78,240 - mov %r10, -24(%rdi) - adc %r11, %r8 -.Lmb3: .byte 0xc4,98,171,0xf6,94,248 - adc %r9, %r10 - mov %r8, -16(%rdi) - dec %ecx - .byte 0xc4,98,187,0xf6,14 - jnz .Ltop - -.Lend: mov %r10, -8(%rdi) - adc %r11, %r8 - - - - - lea .Latab(%rip), %r10 - movslq (%r10,%rax,4), %r11 - lea (%r11, %r10), %r11 - - mov $63, %eax - jmp *%r11 - -.Led0: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf7: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea -64(%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea -56(%rdi,%rbx,8), %rdi - mov (%rsi), %r9 - mov 8(%rsi), %rdx - .byte 0xc4,66,251,0xf7,193 - .byte 0xc4,66,250,0xf7,201 - and %rdx, %r9 - .byte 0xc4,98,171,0xf6,218 - lea (%r8,%rdx,2), %rdx - jmp .Lb7 - - .align 16, 0x90 -.Ltp0: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led0 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx -.Lb0: mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 
0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp0 - -.Led1: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf0: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea -64(%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea -56(%rdi,%rbx,8), %rdi - mov -8(%rsi), %r11 - mov (%rsi), %rdx - .byte 0xc4,66,251,0xf7,211 - .byte 0xc4,66,250,0xf7,219 - and %rdx, %r11 - .byte 0xc4,98,187,0xf6,202 - lea (%r10,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - jmp .Lb0 + mul %r13 + xor %ebp, %ebp + mov %rax, %rbx + mov 16(%rsi,%r9,8), %rax + mov %rdx, %r10 + jmp .LL3 .align 16, 0x90 -.Ltp1: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led1 -.Lb1: .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 
0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp1 - -.Led2: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf1: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea (%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea 8(%rbx), %rbx - lea -56(%rdi,%rbx,8), %rdi - mov -16(%rsi), %r9 - mov -8(%rsi), %rdx - .byte 0xc4,66,251,0xf7,193 - .byte 0xc4,66,250,0xf7,201 - and %rdx, %r9 - .byte 0xc4,98,171,0xf6,218 - lea (%r8,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jmp .Lb1 +.Lmul_1_m3_top: + add %rax, %rbp + mov %r10, (%r12,%r9,8) + mov (%rsi,%r9,8), %rax + adc %rdx, %rcx + xor %ebx, %ebx + mul %r13 + xor %r10d, %r10d + mov %rbp, 8(%r12,%r9,8) + add %rax, %rcx + adc %rdx, %rbx + mov 8(%rsi,%r9,8), %rax + mov %rcx, 16(%r12,%r9,8) + xor %ebp, %ebp + mul %r13 + add %rax, %rbx + mov 16(%rsi,%r9,8), %rax + adc %rdx, %r10 +.LL3: xor %ecx, %ecx + mul %r13 + add %rax, %r10 + mov 24(%rsi,%r9,8), %rax + adc %rdx, %rbp + mov %rbx, 24(%r12,%r9,8) + mul %r13 + add $4, %r9 + js .Lmul_1_m3_top + + add %rax, %rbp + mov %r10, (%r12) + adc %rdx, %rcx + mov %rbp, 8(%r12) + mov %rcx, 16(%r12) + + lea 16(%r12), %r12 + lea -8(%rsi), %rsi + jmp .Ldowhile + + +.L1m4: + lea 8(%rdi,%r11,8), %r12 + mov (%rsi), %r13 + mov 8(%rsi), %rax + lea 8(%rsi,%r11,8), %rsi + + lea -3(%r11), %r8 + + lea -3(%r11), %r9 + neg %r9 + + mov %rax, %r14 + mul %r13 + mov %rdx, %rcx + xor %ebp, %ebp + mov %rax, 8(%rdi) + jmp .Lm0 .align 16, 0x90 -.Ltp2: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led2 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - 
.byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) -.Lb2: .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp2 - -.Led3: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf2: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea (%rsi,%rbx,8), %rsi - or %ebx, %ecx - jz .Lcor3 - lea -56(%rdi,%rbx,8), %rdi - mov -24(%rsi), %r11 - mov -16(%rsi), %rdx - .byte 0xc4,66,251,0xf7,211 - .byte 0xc4,66,250,0xf7,219 - and %rdx, %r11 - .byte 0xc4,98,187,0xf6,202 - lea (%r10,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,195 - jmp .Lb2 +.Lmul_2_m0_top: + mul %r14 + add %rax, %rbx + adc %rdx, %rcx + mov -24(%rsi,%r9,8), %rax + mov $0, %ebp + mul %r13 + add %rax, %rbx + mov -24(%rsi,%r9,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r14 + add %rax, %rcx + mov %rbx, -24(%r12,%r9,8) + adc %rdx, %rbp +.Lm0: mov -16(%rsi,%r9,8), %rax + mul %r13 + mov $0, %r10d + add %rax, %rcx + adc %rdx, %rbp + mov -16(%rsi,%r9,8), %rax + adc $0, %r10d + mov $0, %ebx + mov %rcx, -16(%r12,%r9,8) + mul %r14 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + mov $0, %ecx + mul %r13 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + adc $0, %ebx + mul %r14 + add %rax, %r10 + mov %rbp, -8(%r12,%r9,8) + adc %rdx, %rbx +.Lm2x: mov (%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + adc %rdx, %rbx + adc $0, %ecx + add $4, %r9 + mov -32(%rsi,%r9,8), %rax + mov %r10, -32(%r12,%r9,8) + js .Lmul_2_m0_top + + mul %r14 + add %rax, %rbx + 
adc %rdx, %rcx + mov %rbx, -8(%r12) + mov %rcx, (%r12) + + lea -16(%rsi), %rsi + lea 0(%r12), %r12 + jmp .Ldowhile_end + + +.L2m4: + lea -16(%rdi,%r11,8), %r12 + mov (%rsi), %r13 + mov 8(%rsi), %rax + lea (%rsi,%r11,8), %rsi + + lea -4(%r11), %r8 + + lea -2(%r11), %r9 + neg %r9 + + mul %r13 + mov %rax, %rbp + mov (%rsi,%r9,8), %rax + mov %rdx, %rcx + jmp .LL1 .align 16, 0x90 -.Ltp3: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led3 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) -.Lb3: .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp3 - -.Led4: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf3: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea (%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea -56(%rdi,%rbx,8), %rdi - mov -32(%rsi), %r9 - mov -24(%rsi), %rdx - .byte 0xc4,66,251,0xf7,193 - .byte 0xc4,66,250,0xf7,201 - and %rdx, %r9 - .byte 0xc4,98,171,0xf6,218 - lea (%r8,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,209 - jmp .Lb3 +.Lmul_1_m1_top: + add %rax, %rbp + mov %r10, (%r12,%r9,8) + mov (%rsi,%r9,8), %rax + adc %rdx, %rcx +.LL1: xor %ebx, 
%ebx + mul %r13 + xor %r10d, %r10d + mov %rbp, 8(%r12,%r9,8) + add %rax, %rcx + adc %rdx, %rbx + mov 8(%rsi,%r9,8), %rax + mov %rcx, 16(%r12,%r9,8) + xor %ebp, %ebp + mul %r13 + add %rax, %rbx + mov 16(%rsi,%r9,8), %rax + adc %rdx, %r10 + xor %ecx, %ecx + mul %r13 + add %rax, %r10 + mov 24(%rsi,%r9,8), %rax + adc %rdx, %rbp + mov %rbx, 24(%r12,%r9,8) + mul %r13 + add $4, %r9 + js .Lmul_1_m1_top + + add %rax, %rbp + mov %r10, (%r12) + adc %rdx, %rcx + mov %rbp, 8(%r12) + mov %rcx, 16(%r12) + + lea 16(%r12), %r12 + lea -8(%rsi), %rsi + jmp .Ldowhile_mid + + +.L3m4: + lea 8(%rdi,%r11,8), %r12 + mov (%rsi), %r13 + mov 8(%rsi), %rax + lea 8(%rsi,%r11,8), %rsi + + lea -5(%r11), %r8 + + lea -1(%r11), %r9 + neg %r9 + + mov %rax, %r14 + mul %r13 + mov %rdx, %r10 + xor %ebx, %ebx + xor %ecx, %ecx + mov %rax, 8(%rdi) + jmp .Lm2 .align 16, 0x90 -.Ltp4: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led4 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) -.Lb4: .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp4 - -.Led5: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 
0xf3,76,0x0f,0x38,0xf6,201 -.Lf4: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea (%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea -56(%rdi,%rbx,8), %rdi - mov -40(%rsi), %r11 - mov -32(%rsi), %rdx - .byte 0xc4,66,251,0xf7,211 - .byte 0xc4,66,250,0xf7,219 - and %rdx, %r11 - .byte 0xc4,98,187,0xf6,202 - lea (%r10,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,195 - jmp .Lb4 +.Lmul_2_m2_top: + mul %r14 + add %rax, %rbx + adc %rdx, %rcx + mov -24(%rsi,%r9,8), %rax + mov $0, %ebp + mul %r13 + add %rax, %rbx + mov -24(%rsi,%r9,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r14 + add %rax, %rcx + mov %rbx, -24(%r12,%r9,8) + adc %rdx, %rbp + mov -16(%rsi,%r9,8), %rax + mul %r13 + mov $0, %r10d + add %rax, %rcx + adc %rdx, %rbp + mov -16(%rsi,%r9,8), %rax + adc $0, %r10d + mov $0, %ebx + mov %rcx, -16(%r12,%r9,8) + mul %r14 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + mov $0, %ecx + mul %r13 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + adc $0, %ebx + mul %r14 + add %rax, %r10 + mov %rbp, -8(%r12,%r9,8) + adc %rdx, %rbx +.Lm2: mov (%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + adc %rdx, %rbx + adc $0, %ecx + add $4, %r9 + mov -32(%rsi,%r9,8), %rax + mov %r10, -32(%r12,%r9,8) + js .Lmul_2_m2_top + + mul %r14 + add %rax, %rbx + adc %rdx, %rcx + mov %rbx, -8(%r12) + mov %rcx, (%r12) + + lea -16(%rsi), %rsi + jmp .Ldowhile_mid + +.Ldowhile: + + lea 4(%r8), %r9 + neg %r9 + + mov 16(%rsi,%r9,8), %r13 + mov 24(%rsi,%r9,8), %r14 + mov 24(%rsi,%r9,8), %rax + mul %r13 + xor %r10d, %r10d + add %rax, 24(%r12,%r9,8) + adc %rdx, %r10 + xor %ebx, %ebx + xor %ecx, %ecx + jmp .Lam2 .align 16, 0x90 -.Ltp5: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led5 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov 
%r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) -.Lb5: .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp5 - -.Led6: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf5: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea (%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea -56(%rdi,%rbx,8), %rdi - mov -48(%rsi), %r9 - mov -40(%rsi), %rdx - .byte 0xc4,66,251,0xf7,193 - .byte 0xc4,66,250,0xf7,201 - and %rdx, %r9 - .byte 0xc4,98,171,0xf6,218 - lea (%r8,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,209 - jmp .Lb5 +.Laddmul_2_m2_top: + add %r10, (%r12,%r9,8) + adc %rax, %rbx + mov 8(%rsi,%r9,8), %rax + adc %rdx, %rcx + mov $0, %ebp + mul %r13 + add %rax, %rbx + mov 8(%rsi,%r9,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r14 + add %rbx, 8(%r12,%r9,8) + adc %rax, %rcx + adc %rdx, %rbp + mov 16(%rsi,%r9,8), %rax + mov $0, %r10d + mul %r13 + add %rax, %rcx + mov 16(%rsi,%r9,8), %rax + adc %rdx, %rbp + adc $0, %r10d + mul %r14 + add %rcx, 16(%r12,%r9,8) + adc %rax, %rbp + mov 24(%rsi,%r9,8), %rax + adc %rdx, %r10 + mul %r13 + mov $0, %ebx + add %rax, %rbp + adc %rdx, %r10 + mov $0, %ecx + mov 24(%rsi,%r9,8), %rax + adc $0, %ebx + mul %r14 + add %rbp, 24(%r12,%r9,8) + adc %rax, %r10 + adc %rdx, %rbx +.Lam2: mov 32(%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + mov 32(%rsi,%r9,8), %rax + adc %rdx, %rbx + adc $0, %ecx + mul %r14 + add $4, %r9 + js 
.Laddmul_2_m2_top + + add %r10, (%r12) + adc %rax, %rbx + adc %rdx, %rcx + mov %rbx, 8(%r12) + mov %rcx, 16(%r12) + + lea 16(%r12), %r12 + + add $-2, %r8d + +.Ldowhile_mid: + + lea 2(%r8), %r9 + neg %r9 + + mov (%rsi,%r9,8), %r13 + mov 8(%rsi,%r9,8), %r14 + mov 8(%rsi,%r9,8), %rax + mul %r13 + xor %ecx, %ecx + add %rax, 8(%r12,%r9,8) + adc %rdx, %rcx + xor %ebp, %ebp + jmp .L20 .align 16, 0x90 -.Ltp6: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led6 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi -.Lb6: .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp6 - -.Led7: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf6: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea (%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea -56(%rdi,%rbx,8), %rdi - mov -56(%rsi), %r11 - mov -48(%rsi), %rdx - .byte 0xc4,66,251,0xf7,211 - .byte 0xc4,66,250,0xf7,219 - and %rdx, %r11 - .byte 0xc4,98,187,0xf6,202 - lea (%r10,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xc4,98,171,0xf6,94,216 - jmp .Lb6 +.Laddmul_2_m0_top: + add %r10, (%r12,%r9,8) + adc %rax, %rbx + 
mov 8(%rsi,%r9,8), %rax + adc %rdx, %rcx + mov $0, %ebp + mul %r13 + add %rax, %rbx + mov 8(%rsi,%r9,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r14 + add %rbx, 8(%r12,%r9,8) + adc %rax, %rcx + adc %rdx, %rbp +.L20: mov 16(%rsi,%r9,8), %rax + mov $0, %r10d + mul %r13 + add %rax, %rcx + mov 16(%rsi,%r9,8), %rax + adc %rdx, %rbp + adc $0, %r10d + mul %r14 + add %rcx, 16(%r12,%r9,8) + adc %rax, %rbp + mov 24(%rsi,%r9,8), %rax + adc %rdx, %r10 + mul %r13 + mov $0, %ebx + add %rax, %rbp + adc %rdx, %r10 + mov $0, %ecx + mov 24(%rsi,%r9,8), %rax + adc $0, %ebx + mul %r14 + add %rbp, 24(%r12,%r9,8) + adc %rax, %r10 + adc %rdx, %rbx + mov 32(%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + mov 32(%rsi,%r9,8), %rax + adc %rdx, %rbx + adc $0, %ecx + mul %r14 + add $4, %r9 + js .Laddmul_2_m0_top + + add %r10, (%r12) + adc %rax, %rbx + adc %rdx, %rcx + mov %rbx, 8(%r12) + mov %rcx, 16(%r12) + + lea 16(%r12), %r12 +.Ldowhile_end: + + add $-2, %r8d + jne .Ldowhile + + + mov -16(%rsi), %r13 + mov -8(%rsi), %r14 + mov -8(%rsi), %rax + mul %r13 + xor %r10d, %r10d + add %rax, -8(%r12) + adc %rdx, %r10 + xor %ebx, %ebx + xor %ecx, %ecx + mov (%rsi), %rax + mul %r13 + add %rax, %r10 + mov (%rsi), %rax + adc %rdx, %rbx + mul %r14 + add %r10, (%r12) + adc %rax, %rbx + adc %rdx, %rcx + mov %rbx, 8(%r12) + mov %rcx, 16(%r12) + + + lea -4(%r11,%r11), %r9 + + mov 8(%rdi), %r11 + lea -8(%rsi), %rsi + lea (%rdi,%r9,8), %rdi + neg %r9 + mov (%rsi,%r9,4), %rax + mul %rax + test $2, %r9b + jnz .Lodd + +.Levn: add %r11, %r11 + sbb %ebx, %ebx + add %rdx, %r11 + mov %rax, (%rdi,%r9,8) + jmp .Ld0 + +.Lodd: add %r11, %r11 + sbb %ebp, %ebp + add %rdx, %r11 + mov %rax, (%rdi,%r9,8) + lea -2(%r9), %r9 + jmp .Ld1 .align 16, 0x90 -.Ltp7: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led7 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) -.Lb7: .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 
0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp7 - -.Lcor3:lea -64(%rdi), %rdi - mov -24(%rsi), %r11 - mov -16(%rsi), %rdx - .byte 0xc4,66,251,0xf7,211 - .byte 0xc4,66,250,0xf7,219 - and %rdx, %r11 - .byte 0xc4,98,187,0xf6,202 - lea (%r10,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,71,56 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 56(%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,227,0xf6,14 - .byte 0xf3,76,0x0f,0x38,0xf6,87,64 - .byte 0x66,73,0x0f,0x38,0xf6,219 - mov %r10, 64(%rdi) - .byte 0xf3,72,0x0f,0x38,0xf6,95,72 - .byte 0xf3,76,0x0f,0x38,0xf6,201 - adc %rcx, %r9 - mov %r9, 80(%rdi) - - mov -16(%rsi), %r9 - mov -8(%rsi), %rdx - .byte 0xc4,66,251,0xf7,193 - .byte 0xc4,66,250,0xf7,201 - and %rdx, %r9 - .byte 0xc4,98,171,0xf6,218 - lea (%r8,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,226,187,0xf6,6 - .byte 0xf3,76,0x0f,0x38,0xf6,211 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 72(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,80 - .byte 0xf3,72,0x0f,0x38,0xf6,193 - mov %r8, 80(%rdi) - adc %rcx, %rax - - mov -8(%rsi), %r11 - mov (%rsi), %rdx - sar $63, %r11 - and %rdx, %r11 - .byte 0xc4,98,187,0xf6,202 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 
0xf3,76,0x0f,0x38,0xf6,192 - mov %r8, 88(%rdi) - .byte 0x66,76,0x0f,0x38,0xf6,201 - .byte 0xf3,76,0x0f,0x38,0xf6,201 - mov %r9, 96(%rdi) +.Ltop: mov (%rsi,%r9,4), %rax + mul %rax + add %ebp, %ebp + adc %rax, %r10 + adc %rdx, %r11 + mov %r10, (%rdi,%r9,8) +.Ld0: mov %r11, 8(%rdi,%r9,8) + mov 16(%rdi,%r9,8), %r10 + adc %r10, %r10 + mov 24(%rdi,%r9,8), %r11 + adc %r11, %r11 + nop + sbb %ebp, %ebp + mov 8(%rsi,%r9,4), %rax + mul %rax + add %ebx, %ebx + adc %rax, %r10 + adc %rdx, %r11 + mov %r10, 16(%rdi,%r9,8) +.Ld1: mov %r11, 24(%rdi,%r9,8) + mov 32(%rdi,%r9,8), %r10 + adc %r10, %r10 + mov 40(%rdi,%r9,8), %r11 + adc %r11, %r11 + sbb %ebx, %ebx + add $4, %r9 + js .Ltop + + mov (%rsi), %rax + mul %rax + add %ebp, %ebp + adc %rax, %r10 + adc %rdx, %r11 + mov %r10, (%rdi) + mov %r11, 8(%rdi) + mov 16(%rdi), %r10 + adc %r10, %r10 + sbb %ebp, %ebp + neg %ebp + mov 8(%rsi), %rax + mul %rax + add %ebx, %ebx + adc %rax, %r10 + adc %rbp, %rdx + mov %r10, 16(%rdi) + mov %rdx, 24(%rdi) + pop %r14 + pop %r13 + pop %r12 + pop %rbp pop %rbx ret - - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Lmtab:.long .Lmf7-.Lmtab - .long .Lmf0-.Lmtab - .long .Lmf1-.Lmtab - .long .Lmf2-.Lmtab - .long .Lmf3-.Lmtab - .long .Lmf4-.Lmtab - .long .Lmf5-.Lmtab - .long .Lmf6-.Lmtab -.Latab:.long .Lf6-.Latab - .long .Lf7-.Latab - .long .Lf0-.Latab - .long .Lf1-.Latab - .long .Lf2-.Latab - .long .Lf3-.Latab - .long .Lf4-.Latab - .long .Lf5-.Latab - .text .size __gmpn_sqr_basecase,.-__gmpn_sqr_basecase diff --git a/ext/gmp/gen/x86_64-linux/mpn/sub_err1_n.s b/ext/gmp/gen/x86_64-linux/mpn/sub_err1_n.s index 7db64b894e..cbef8af042 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/sub_err1_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/sub_err1_n.s @@ -189,20 +189,20 @@ __gmpn_sub_err1_n: .align 32, 0x90 .Lloop: - mov (%rsi,%r9,8), %r14 shr $1, %al mov -8(%r8), %r10 mov $0, %r13d + mov (%rsi,%r9,8), %r14 + mov 8(%rsi,%r9,8), %r15 sbb (%rdx,%r9,8), %r14 cmovnc %r13, %r10 - mov %r14, (%rdi,%r9,8) - mov 
8(%rsi,%r9,8), %r15 - mov 16(%rsi,%r9,8), %r14 sbb 8(%rdx,%r9,8), %r15 mov -16(%r8), %r11 + mov %r14, (%rdi,%r9,8) + mov 16(%rsi,%r9,8), %r14 + mov %r15, 8(%rdi,%r9,8) cmovnc %r13, %r11 mov -24(%r8), %r12 - mov %r15, 8(%rdi,%r9,8) sbb 16(%rdx,%r9,8), %r14 cmovnc %r13, %r12 mov 24(%rsi,%r9,8), %r15 @@ -215,12 +215,12 @@ __gmpn_sub_err1_n: adc $0, %rbp add %r12, %rbx adc $0, %rbp - lea -32(%r8), %r8 mov %r14, 16(%rdi,%r9,8) add %r13, %rbx + lea -32(%r8), %r8 adc $0, %rbp + mov %r15, 24(%rdi,%r9,8) add $4, %r9 - mov %r15, -8(%rdi,%r9,8) jnz .Lloop .Lend: diff --git a/ext/gmp/gen/x86_64-linux/mpn/sub_n.s b/ext/gmp/gen/x86_64-linux/mpn/sub_n.s index 2ae18233ca..8c1db0a02f 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/sub_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/sub_n.s @@ -94,20 +94,18 @@ __gmpn_sub_nc: - mov %ecx, %eax - shr $3, %rcx - and $7, %eax - - lea .Ltab(%rip), %r9 - neg %r8 + shr $2, %rcx + and $3, %eax + bt $0, %r8 + jrcxz .Llt4 - movslq (%r9,%rax,4), %rax - lea (%r9,%rax), %rax - jmp *%rax + mov (%rsi), %r8 + mov 8(%rsi), %r9 + dec %rcx + jmp .Lmid .size __gmpn_sub_nc,.-__gmpn_sub_nc - .align 16, 0x90 .globl __gmpn_sub_n .type __gmpn_sub_n,@function @@ -115,159 +113,82 @@ __gmpn_sub_nc: __gmpn_sub_n: - mov %ecx, %eax - shr $3, %rcx - and $7, %eax - - lea .Ltab(%rip), %r9 + shr $2, %rcx + and $3, %eax + jrcxz .Llt4 - movslq (%r9,%rax,4), %rax - lea (%r9,%rax), %rax - jmp *%rax - - -.L0: mov (%rsi), %r8 + mov (%rsi), %r8 mov 8(%rsi), %r9 + dec %rcx + jmp .Lmid + +.Llt4: dec %eax + mov (%rsi), %r8 + jnz .L2 sbb (%rdx), %r8 - jmp .Le0 + mov %r8, (%rdi) + adc %eax, %eax + + ret -.L4: mov (%rsi), %r8 +.L2: dec %eax mov 8(%rsi), %r9 + jnz .L3 sbb (%rdx), %r8 - lea -32(%rsi), %rsi - lea -32(%rdx), %rdx - lea -32(%rdi), %rdi - inc %rcx - jmp .Le4 - -.L5: mov (%rsi), %r11 - mov 8(%rsi), %r8 - mov 16(%rsi), %r9 - sbb (%rdx), %r11 - lea -24(%rsi), %rsi - lea -24(%rdx), %rdx - lea -24(%rdi), %rdi - inc %rcx - jmp .Le5 - -.L6: mov (%rsi), %r10 - sbb (%rdx), %r10 - mov 8(%rsi), 
%r11 - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi - inc %rcx - jmp .Le6 - -.L7: mov (%rsi), %r9 - mov 8(%rsi), %r10 - sbb (%rdx), %r9 - sbb 8(%rdx), %r10 - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi - inc %rcx - jmp .Le7 + sbb 8(%rdx), %r9 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + adc %eax, %eax + + ret - .align 16, 0x90 -.Ltop: -.Le3: mov %r9, 40(%rdi) -.Le2: mov %r10, 48(%rdi) -.Le1: mov (%rsi), %r8 - mov 8(%rsi), %r9 +.L3: mov 16(%rsi), %r10 sbb (%rdx), %r8 - mov %r11, 56(%rdi) - lea 64(%rdi), %rdi -.Le0: mov 16(%rsi), %r10 sbb 8(%rdx), %r9 sbb 16(%rdx), %r10 mov %r8, (%rdi) -.Le7: mov 24(%rsi), %r11 mov %r9, 8(%rdi) -.Le6: mov 32(%rsi), %r8 - mov 40(%rsi), %r9 - sbb 24(%rdx), %r11 mov %r10, 16(%rdi) -.Le5: sbb 32(%rdx), %r8 - mov %r11, 24(%rdi) -.Le4: mov 48(%rsi), %r10 - mov 56(%rsi), %r11 - mov %r8, 32(%rdi) - lea 64(%rsi), %rsi - sbb 40(%rdx), %r9 - sbb 48(%rdx), %r10 - sbb 56(%rdx), %r11 - lea 64(%rdx), %rdx - dec %rcx - jnz .Ltop - -.Lend: mov %r9, 40(%rdi) - mov %r10, 48(%rdi) - mov %r11, 56(%rdi) - mov %ecx, %eax - adc %ecx, %eax + setc %al ret .align 16, 0x90 -.L3: mov (%rsi), %r9 - mov 8(%rsi), %r10 - mov 16(%rsi), %r11 - sbb (%rdx), %r9 - sbb 8(%rdx), %r10 - sbb 16(%rdx), %r11 - jrcxz .Lx3 - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea -40(%rdi), %rdi - jmp .Le3 -.Lx3: mov %r9, (%rdi) - mov %r10, 8(%rdi) - mov %r11, 16(%rdi) - mov %ecx, %eax - adc %ecx, %eax - - ret +.Ltop: sbb (%rdx), %r8 + sbb 8(%rdx), %r9 + sbb 16(%rdx), %r10 + sbb 24(%rdx), %r11 + mov %r8, (%rdi) + lea 32(%rsi), %rsi + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + dec %rcx + mov %r11, 24(%rdi) + lea 32(%rdx), %rdx + mov (%rsi), %r8 + mov 8(%rsi), %r9 + lea 32(%rdi), %rdi +.Lmid: mov 16(%rsi), %r10 + mov 24(%rsi), %r11 + jnz .Ltop - .align 16, 0x90 -.L1: mov (%rsi), %r11 - sbb (%rdx), %r11 - jrcxz .Lx1 - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea -56(%rdi), %rdi - jmp .Le1 -.Lx1: mov %r11, (%rdi) - mov %ecx, %eax - adc %ecx, %eax - - ret +.Lend: lea 
32(%rsi), %rsi + sbb (%rdx), %r8 + sbb 8(%rdx), %r9 + sbb 16(%rdx), %r10 + sbb 24(%rdx), %r11 + lea 32(%rdx), %rdx + mov %r8, (%rdi) + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + mov %r11, 24(%rdi) + lea 32(%rdi), %rdi - .align 16, 0x90 -.L2: mov (%rsi), %r10 - mov 8(%rsi), %r11 - sbb (%rdx), %r10 - sbb 8(%rdx), %r11 - jrcxz .Lx2 - lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea -48(%rdi), %rdi - jmp .Le2 -.Lx2: mov %r10, (%rdi) - mov %r11, 8(%rdi) - mov %ecx, %eax - adc %ecx, %eax + inc %eax + dec %eax + jnz .Llt4 + adc %eax, %eax ret .size __gmpn_sub_n,.-__gmpn_sub_n - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .L0-.Ltab - .long .L1-.Ltab - .long .L2-.Ltab - .long .L3-.Ltab - .long .L4-.Ltab - .long .L5-.Ltab - .long .L6-.Ltab - .long .L7-.Ltab diff --git a/ext/gmp/gen/x86_64-linux/mpn/sublsh1_n.s b/ext/gmp/gen/x86_64-linux/mpn/sublsh1_n.s index cabbb914a0..d257a0544b 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/sublsh1_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/sublsh1_n.s @@ -58,14 +58,6 @@ - - - - - - - - @@ -76,7 +68,7 @@ .text - .align 8, 0x90 + .align 16, 0x90 .globl __gmpn_sublsh1_n .type __gmpn_sublsh1_n,@function @@ -84,107 +76,100 @@ __gmpn_sublsh1_n: push %rbx - push %r12 + push %rbp + mov (%rdx), %r8 mov %ecx, %eax - lea 24(%rsi,%rcx,8), %rsi - lea 24(%rdx,%rcx,8), %rdx - lea 24(%rdi,%rcx,8), %rdi + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rdx neg %rcx - - xor %r11d, %r11d - - mov -24(%rdx,%rcx,8), %r8 - shrd $63, %r8, %r11 - + xor %ebp, %ebp and $3, %eax - je .Lb0 + je .Lb00 cmp $2, %eax - jc .Lb1 - je .Lb2 - -.Lb3: mov -16(%rdx,%rcx,8), %r9 - shrd $63, %r9, %r8 - mov -8(%rdx,%rcx,8), %r10 - shrd $63, %r10, %r9 - mov -24(%rsi,%rcx,8), %r12 - sub %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - mov -16(%rsi,%rcx,8), %r12 - sbb %r8, %r12 - mov %r12, -16(%rdi,%rcx,8) - mov -8(%rsi,%rcx,8), %r12 - sbb %r9, %r12 - mov %r12, -8(%rdi,%rcx,8) - mov %r10, %r11 + jc .Lb01 + je .Lb10 + +.Lb11: add %r8, %r8 + mov 
8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 sbb %eax, %eax + mov (%rsi,%rcx,8), %rbp + mov 8(%rsi,%rcx,8), %rbx + sub %r8, %rbp + sbb %r9, %rbx + mov %rbp, (%rdi,%rcx,8) + mov %rbx, 8(%rdi,%rcx,8) + mov 16(%rsi,%rcx,8), %rbp + sbb %r10, %rbp + mov %rbp, 16(%rdi,%rcx,8) + sbb %ebp, %ebp add $3, %rcx - js .Ltop - jmp .Lend + jmp .Lent -.Lb1: mov -24(%rsi,%rcx,8), %r12 - sub %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - mov %r8, %r11 - sbb %eax, %eax - inc %rcx - js .Ltop - jmp .Lend - -.Lb2: mov -16(%rdx,%rcx,8), %r9 - shrd $63, %r9, %r8 - mov -24(%rsi,%rcx,8), %r12 - sub %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - mov -16(%rsi,%rcx,8), %r12 - sbb %r8, %r12 - mov %r12, -16(%rdi,%rcx,8) - mov %r9, %r11 +.Lb10: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 sbb %eax, %eax + mov (%rsi,%rcx,8), %rbp + mov 8(%rsi,%rcx,8), %rbx + sub %r8, %rbp + sbb %r9, %rbx + mov %rbp, (%rdi,%rcx,8) + mov %rbx, 8(%rdi,%rcx,8) + sbb %ebp, %ebp add $2, %rcx - js .Ltop - jmp .Lend + jmp .Lent - .align 16, 0x90 -.Ltop: mov -24(%rdx,%rcx,8), %r8 - shrd $63, %r8, %r11 -.Lb0: mov -16(%rdx,%rcx,8), %r9 - shrd $63, %r9, %r8 - mov -8(%rdx,%rcx,8), %r10 - shrd $63, %r10, %r9 - mov (%rdx,%rcx,8), %rbx - shrd $63, %rbx, %r10 - - add %eax, %eax - - mov -24(%rsi,%rcx,8), %r12 - sbb %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - - mov -16(%rsi,%rcx,8), %r12 - sbb %r8, %r12 - mov %r12, -16(%rdi,%rcx,8) - - mov -8(%rsi,%rcx,8), %r12 - sbb %r9, %r12 - mov %r12, -8(%rdi,%rcx,8) - - mov (%rsi,%rcx,8), %r12 - sbb %r10, %r12 - mov %r12, (%rdi,%rcx,8) - - mov %rbx, %r11 +.Lb01: add %r8, %r8 sbb %eax, %eax + mov (%rsi,%rcx,8), %rbp + sub %r8, %rbp + mov %rbp, (%rdi,%rcx,8) + sbb %ebp, %ebp + inc %rcx +.Lent: jns .Lend + + .align 16, 0x90 +.Ltop: add %eax, %eax + mov (%rdx,%rcx,8), %r8 +.Lb00: adc %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 + mov 24(%rdx,%rcx,8), %r11 + adc %r11, %r11 + + sbb %eax, %eax + add %ebp, %ebp + + mov 
(%rsi,%rcx,8), %rbp + mov 8(%rsi,%rcx,8), %rbx + sbb %r8, %rbp + sbb %r9, %rbx + mov %rbp, (%rdi,%rcx,8) + mov %rbx, 8(%rdi,%rcx,8) + mov 16(%rsi,%rcx,8), %rbp + mov 24(%rsi,%rcx,8), %rbx + sbb %r10, %rbp + sbb %r11, %rbx + mov %rbp, 16(%rdi,%rcx,8) + mov %rbx, 24(%rdi,%rcx,8) + + sbb %ebp, %ebp add $4, %rcx js .Ltop -.Lend: shr $63, %r11 - pop %r12 - pop %rbx - sub %r11d, %eax +.Lend: add %ebp, %eax neg %eax + + pop %rbp + pop %rbx ret .size __gmpn_sublsh1_n,.-__gmpn_sublsh1_n - diff --git a/ext/gmp/gen/x86_64-linux/mpn/sublsh2_n.s b/ext/gmp/gen/x86_64-linux/mpn/sublsh2_n.s deleted file mode 100644 index d5bf3a7be3..0000000000 --- a/ext/gmp/gen/x86_64-linux/mpn/sublsh2_n.s +++ /dev/null @@ -1,190 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - .text - .align 8, 0x90 - .globl __gmpn_sublsh2_n - .type __gmpn_sublsh2_n,@function - -__gmpn_sublsh2_n: - - - push %rbx - push %r12 - - mov %ecx, %eax - lea 24(%rsi,%rcx,8), %rsi - lea 24(%rdx,%rcx,8), %rdx - lea 24(%rdi,%rcx,8), %rdi - neg %rcx - - xor %r11d, %r11d - - mov -24(%rdx,%rcx,8), %r8 - shrd $62, %r8, %r11 - - and $3, %eax - je .Lb0 - cmp $2, %eax - jc .Lb1 - je .Lb2 - -.Lb3: mov -16(%rdx,%rcx,8), %r9 - shrd $62, %r9, %r8 - mov -8(%rdx,%rcx,8), %r10 - shrd $62, %r10, %r9 - mov -24(%rsi,%rcx,8), %r12 - sub %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - mov -16(%rsi,%rcx,8), %r12 - sbb %r8, %r12 - mov %r12, -16(%rdi,%rcx,8) - mov -8(%rsi,%rcx,8), %r12 - sbb %r9, %r12 - mov %r12, -8(%rdi,%rcx,8) - mov %r10, %r11 - sbb %eax, %eax - add $3, %rcx - js .Ltop - jmp .Lend - -.Lb1: mov -24(%rsi,%rcx,8), %r12 - sub %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - mov %r8, %r11 - sbb %eax, %eax - inc %rcx - js .Ltop - jmp .Lend - -.Lb2: mov -16(%rdx,%rcx,8), %r9 - shrd $62, %r9, %r8 - mov -24(%rsi,%rcx,8), %r12 - sub %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - mov -16(%rsi,%rcx,8), %r12 - sbb %r8, %r12 - mov %r12, 
-16(%rdi,%rcx,8) - mov %r9, %r11 - sbb %eax, %eax - add $2, %rcx - js .Ltop - jmp .Lend - - .align 16, 0x90 -.Ltop: mov -24(%rdx,%rcx,8), %r8 - shrd $62, %r8, %r11 -.Lb0: mov -16(%rdx,%rcx,8), %r9 - shrd $62, %r9, %r8 - mov -8(%rdx,%rcx,8), %r10 - shrd $62, %r10, %r9 - mov (%rdx,%rcx,8), %rbx - shrd $62, %rbx, %r10 - - add %eax, %eax - - mov -24(%rsi,%rcx,8), %r12 - sbb %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - - mov -16(%rsi,%rcx,8), %r12 - sbb %r8, %r12 - mov %r12, -16(%rdi,%rcx,8) - - mov -8(%rsi,%rcx,8), %r12 - sbb %r9, %r12 - mov %r12, -8(%rdi,%rcx,8) - - mov (%rsi,%rcx,8), %r12 - sbb %r10, %r12 - mov %r12, (%rdi,%rcx,8) - - mov %rbx, %r11 - sbb %eax, %eax - - add $4, %rcx - js .Ltop - -.Lend: shr $62, %r11 - pop %r12 - pop %rbx - sub %r11d, %eax - neg %eax - - ret - .size __gmpn_sublsh2_n,.-__gmpn_sublsh2_n - diff --git a/ext/gmp/gen/x86_64-linux/mpn/submul_1.s b/ext/gmp/gen/x86_64-linux/mpn/submul_1.s index 07aaadb7bb..5e34932b8d 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/submul_1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/submul_1.s @@ -68,6 +68,7 @@ + @@ -78,10 +79,8 @@ - - - - + + @@ -89,6 +88,7 @@ + .text .align 16, 0x90 @@ -97,115 +97,100 @@ __gmpn_submul_1: - + + + + + + mov (%rsi), %rax push %rbx - push %rbp - push %r12 - push %r13 - - mov %rdx, %rbp - mov %rcx, %rdx - - test $1, %bpl - jnz .Lbx1 - -.Lbx0: shr $2, %rbp - jc .Lb10 - -.Lb00: .byte 0xc4,98,147,0xf6,38 - .byte 0xc4,226,227,0xf6,70,8 - add %r12, %rbx - adc $0, %rax - mov (%rdi), %r12 - mov 8(%rdi), %rcx - .byte 0xc4,98,179,0xf6,70,16 - lea -16(%rdi), %rdi - lea 16(%rsi), %rsi - sub %r13, %r12 - jmp .Llo0 - -.Lbx1: shr $2, %rbp - jc .Lb11 - -.Lb01: .byte 0xc4,98,163,0xf6,22 - jnz .Lgt1 -.Ln1: sub %r11, (%rdi) - mov $0, %eax - adc %r10, %rax - jmp .Lret + mov %rdx, %rbx + + mul %rcx + mov %rbx, %r11 -.Lgt1: .byte 0xc4,98,147,0xf6,102,8 - .byte 0xc4,226,227,0xf6,70,16 - lea 24(%rsi), %rsi - add %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov (%rdi), %r10 - mov 8(%rdi), %r12 - mov 16(%rdi), %rcx - 
lea -8(%rdi), %rdi - sub %r11, %r10 - jmp .Llo1 - -.Lb11: .byte 0xc4,226,227,0xf6,6 - mov (%rdi), %rcx - .byte 0xc4,98,179,0xf6,70,8 - lea 8(%rsi), %rsi - lea -24(%rdi), %rdi - inc %rbp - sub %rbx, %rcx - jmp .Llo3 - -.Lb10: .byte 0xc4,98,179,0xf6,6 - .byte 0xc4,98,163,0xf6,86,8 - lea -32(%rdi), %rdi - mov $0, %eax - clc - jz .Lend + and $3, %ebx + jz .Lb0 + cmp $2, %ebx + jz .Lb2 + jg .Lb3 + +.Lb1: dec %r11 + jne .Lgt1 + sub %rax, (%rdi) + jmp .Lret +.Lgt1: lea 8(%rsi,%r11,8), %rsi + lea -8(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + xor %ebx, %ebx + mov %rax, %r9 + mov (%rsi,%r11,8), %rax + mov %rdx, %r8 + jmp .LL1 + +.Lb0: lea (%rsi,%r11,8), %rsi + lea -16(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + mov %rax, %r8 + mov %rdx, %rbx + jmp .LL0 + +.Lb3: lea -8(%rsi,%r11,8), %rsi + lea -24(%rdi,%r11,8), %rdi + neg %r11 + mov %rax, %rbx + mov %rdx, %r10 + jmp .LL3 + +.Lb2: lea -16(%rsi,%r11,8), %rsi + lea -32(%rdi,%r11,8), %rdi + neg %r11 + xor %r8, %r8 + xor %ebx, %ebx + mov %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %rdx, %r9 + jmp .LL2 .align 16, 0x90 -.Ltop: adc %rax, %r9 - lea 32(%rdi), %rdi - adc %r8, %r11 - .byte 0xc4,98,147,0xf6,102,16 - mov (%rdi), %r8 - .byte 0xc4,226,227,0xf6,70,24 - lea 32(%rsi), %rsi - adc %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov 8(%rdi), %r10 - mov 16(%rdi), %r12 - sub %r9, %r8 - mov 24(%rdi), %rcx - mov %r8, (%rdi) - sbb %r11, %r10 -.Llo1: .byte 0xc4,98,179,0xf6,6 - mov %r10, 8(%rdi) - sbb %r13, %r12 -.Llo0: mov %r12, 16(%rdi) - sbb %rbx, %rcx -.Llo3: .byte 0xc4,98,163,0xf6,86,8 - mov %rcx, 24(%rdi) - dec %rbp - jnz .Ltop - -.Lend: adc %rax, %r9 - adc %r8, %r11 - mov 32(%rdi), %r8 - mov %r10, %rax - adc $0, %rax - mov 40(%rdi), %r10 - sub %r9, %r8 - mov %r8, 32(%rdi) - sbb %r11, %r10 - mov %r10, 40(%rdi) - adc $0, %rax - -.Lret: pop %r13 - pop %r12 - pop %rbp +.Ltop: sub %r10, (%rdi,%r11,8) + adc %rax, %r9 + mov (%rsi,%r11,8), %rax + adc %rdx, %r8 + mov $0, %r10d +.LL1: mul %rcx + sub %r9, 8(%rdi,%r11,8) + adc %rax, 
%r8 + adc %rdx, %rbx +.LL0: mov 8(%rsi,%r11,8), %rax + mul %rcx + sub %r8, 16(%rdi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 +.LL3: mov 16(%rsi,%r11,8), %rax + mul %rcx + sub %rbx, 24(%rdi,%r11,8) + mov $0, %r8d + mov %r8, %rbx + adc %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %r8, %r9 + adc %rdx, %r9 +.LL2: mul %rcx + add $4, %r11 + js .Ltop + + sub %r10, (%rdi,%r11,8) + adc %rax, %r9 + adc %r8, %rdx + sub %r9, 8(%rdi,%r11,8) +.Lret: adc $0, %rdx + mov %rdx, %rax + pop %rbx - + + ret .size __gmpn_submul_1,.-__gmpn_submul_1 diff --git a/ext/gmp/gen/x86_64-linux/mpn/xnor_n.s b/ext/gmp/gen/x86_64-linux/mpn/xnor_n.s index 360b9b8869..4db0497767 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/xnor_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/xnor_n.s @@ -92,7 +92,6 @@ - .text @@ -106,54 +105,46 @@ __gmpn_xnor_n: mov (%rdx), %r8 not %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: xor (%rsi), %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi +.Lb11: xor (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: xor (%rsi), %r8 - mov %r8, (%rdi) - dec %rcx + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: xor (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 +.Ltop: mov (%rdx,%rcx,8), %r8 not %r8 -.Lb00: mov 8(%rdx), %r9 +.Lb00: mov 8(%rdx,%rcx,8), %r9 not %r9 - xor (%rsi), %r8 - xor 8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 + xor (%rsi,%rcx,8), %r8 + xor 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 not %r8 -.Le10: mov 24(%rdx), %r9 +.Le10: mov 
24(%rdx,%rcx,8), %r9 not %r9 - lea 32(%rdx), %rdx - xor 16(%rsi), %r8 - xor 24(%rsi), %r9 - lea 32(%rsi), %rsi - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + xor 16(%rsi,%rcx,8), %r8 + xor 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/xor_n.s b/ext/gmp/gen/x86_64-linux/mpn/xor_n.s index 6889f2720a..8ef14d059c 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/xor_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/xor_n.s @@ -90,7 +90,6 @@ - .text @@ -103,50 +102,42 @@ __gmpn_xor_n: mov (%rdx), %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: xor (%rsi), %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi +.Lb11: xor (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: xor (%rsi), %r8 - mov %r8, (%rdi) - dec %rcx +.Lb01: xor (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 -.Lb00: mov 8(%rdx), %r9 - xor (%rsi), %r8 - xor 8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 -.Le10: mov 24(%rdx), %r9 - lea 32(%rdx), %rdx - xor 16(%rsi), %r8 - xor 24(%rsi), %r9 - lea 32(%rsi), %rsi - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + +.Ltop: mov (%rdx,%rcx,8), %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + xor (%rsi,%rcx,8), %r8 + xor 8(%rsi,%rcx,8), %r9 + nop + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + xor 16(%rsi,%rcx,8), %r8 + xor 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 
24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/sieve_table.h b/ext/gmp/gen/x86_64-linux/sieve_table.h new file mode 100644 index 0000000000..ee9ac14360 --- /dev/null +++ b/ext/gmp/gen/x86_64-linux/sieve_table.h @@ -0,0 +1,46 @@ +/* This file generated by gen-sieve.c - DO NOT EDIT. */ + +#if GMP_LIMB_BITS != 64 +Error, error, this data is for 64 bits +#endif + +#define PRIMESIEVE_INIT_TABLE \ + CNST_LIMB (0x3294C9E069128480), /* 5 - 196 (42 primes) */ \ + CNST_LIMB (0x95A35E1EC4AB21DC), /* 197 - 388 (32 primes) */ \ + CNST_LIMB (0x4AD7CE99B8693366), /* 389 - 580 (30 primes) */ \ + CNST_LIMB (0x6595B6DA728DC52B), /* 581 - 772 (30 primes) */ \ + CNST_LIMB (0xEA6D9F8787B0CEDE), /* 773 - 964 (26 primes) */ \ + CNST_LIMB (0x3F56A1F4CD3275A9), /* 965 - 1156 (29 primes) */ \ + CNST_LIMB (0xFD3848FB74A76ADB), /* 1157 - 1348 (26 primes) */ \ + CNST_LIMB (0xDBBA0DD1A1EDF6AF), /* 1349 - 1540 (25 primes) */ \ + CNST_LIMB (0xCEC7F17ED22799A5), /* 1541 - 1732 (27 primes) */ \ + CNST_LIMB (0xEAEC17BDBB717D56), /* 1733 - 1924 (24 primes) */ \ + CNST_LIMB (0x3B0EB7B3585AFCF3), /* 1925 - 2116 (26 primes) */ \ + CNST_LIMB (0xE563D8F69FDF6C4F), /* 2117 - 2308 (23 primes) */ \ + CNST_LIMB (0xFE5BA7ABA45E92FC), /* 2309 - 2500 (25 primes) */ \ + CNST_LIMB (0x158DEE6F3BF49B7D), /* 2501 - 2692 (24 primes) */ \ + CNST_LIMB (0xBE5A7BC4EDE6CD1A), /* 2693 - 2884 (26 primes) */ \ + CNST_LIMB (0xD7679B3FCA7BB6AD), /* 2885 - 3076 (22 primes) */ \ + CNST_LIMB (0xC3F66B971FEF37E9), /* 3077 - 3268 (22 primes) */ \ + CNST_LIMB (0x6F7EBCF339C953FD), /* 3269 - 3460 (22 primes) */ \ + CNST_LIMB (0xD5A5ECDCD235DBF0), /* 3461 - 3652 (27 primes) */ \ + CNST_LIMB (0xECFA7B2FD5B65E3B), /* 3653 - 3844 (22 primes) */ \ + CNST_LIMB (0xD28EFDF9C89F67B1), /* 3845 - 4036 (25 primes) */ \ + CNST_LIMB (0xCB7F7C7A3DD3AF4F), /* 4037 - 4228 (21 primes) */ \ + CNST_LIMB (0xEEBED6CDFF6B32CC), /* 4229 - 4420 (22 primes) */ \ + CNST_LIMB (0xD5BD73F85ECFA97C), /* 4421 - 4612 (23 
primes) */ \ + CNST_LIMB (0x21FDBE4FBBAD48F7), /* 4613 - 4804 (24 primes) */ \ + CNST_LIMB (0x5E35A3B5EEB7FDE7), /* 4805 - 4996 (21 primes) */ \ + CNST_LIMB (0xD9EBFD53A7DBBCC9), /* 4997 - 5188 (22 primes) */ \ + CNST_LIMB (0xFF9EDEAF2EFE1F76), /* 5189 - 5380 (18 primes) */ +#define PRIMESIEVE_NUMBEROF_TABLE 28 +/* #define PRIMESIEVE_PRIMES_IN_TABLE 706 */ +#define PRIMESIEVE_HIGHEST_PRIME 5351 +/* #define PRIMESIEVE_FIRST_UNCHECKED 5381 */ + +#define SIEVE_MASK1 CNST_LIMB(0x3204C1A049120485) +#define SIEVE_MASKT CNST_LIMB(0xA1204892058) +#define SIEVE_2MSK1 CNST_LIMB(0x29048402110840A) +#define SIEVE_2MSK2 CNST_LIMB(0x9402180C40230184) +#define SIEVE_2MSKT CNST_LIMB(0x5021088402120) + diff --git a/ext/gmp/gen/x86_64-macos/config.h b/ext/gmp/gen/x86_64-macos/config.h new file mode 100644 index 0000000000..72cebebce1 --- /dev/null +++ b/ext/gmp/gen/x86_64-macos/config.h @@ -0,0 +1,672 @@ +/* config.h. Generated from config.in by configure. */ +/* config.in. Generated from configure.ac by autoheader. */ + +/* + +Copyright 1996-2022 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. 
If not, +see https://www.gnu.org/licenses/. +*/ + +/* Define if building universal (internal helper macro) */ +/* #undef AC_APPLE_UNIVERSAL_BUILD */ + +/* The gmp-mparam.h file (a string) the tune program should suggest updating. + */ +#define GMP_MPARAM_H_SUGGEST "./mpn/x86_64/skylake/gmp-mparam.h" + +/* Define to 1 if you have the `alarm' function. */ +#define HAVE_ALARM 1 + +/* Define to 1 if alloca() works (via gmp-impl.h). */ +#define HAVE_ALLOCA 1 + +/* Define to 1 if you have and it should be used (not on Ultrix). + */ +#define HAVE_ALLOCA_H 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((const)) */ +#define HAVE_ATTRIBUTE_CONST 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((malloc)) */ +#define HAVE_ATTRIBUTE_MALLOC 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((mode (XX))) + */ +#define HAVE_ATTRIBUTE_MODE 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((noreturn)) */ +#define HAVE_ATTRIBUTE_NORETURN 1 + +/* Define to 1 if you have the `attr_get' function. */ +/* #undef HAVE_ATTR_GET */ + +/* Define to 1 if tests/libtests has calling conventions checking for the CPU + */ +#define HAVE_CALLING_CONVENTIONS 1 + +/* Define to 1 if you have the `clock' function. */ +#define HAVE_CLOCK 1 + +/* Define to 1 if you have the `clock_gettime' function */ +#define HAVE_CLOCK_GETTIME 1 + +/* Define to 1 if you have the `cputime' function. */ +/* #undef HAVE_CPUTIME */ + +/* Define to 1 if you have the declaration of `fgetc', and to 0 if you don't. + */ +#define HAVE_DECL_FGETC 1 + +/* Define to 1 if you have the declaration of `fscanf', and to 0 if you don't. + */ +#define HAVE_DECL_FSCANF 1 + +/* Define to 1 if you have the declaration of `optarg', and to 0 if you don't. + */ +#define HAVE_DECL_OPTARG 1 + +/* Define to 1 if you have the declaration of `sys_errlist', and to 0 if you + don't. 
*/ +#define HAVE_DECL_SYS_ERRLIST 1 + +/* Define to 1 if you have the declaration of `sys_nerr', and to 0 if you + don't. */ +#define HAVE_DECL_SYS_NERR 1 + +/* Define to 1 if you have the declaration of `ungetc', and to 0 if you don't. + */ +#define HAVE_DECL_UNGETC 1 + +/* Define to 1 if you have the declaration of `vfprintf', and to 0 if you + don't. */ +#define HAVE_DECL_VFPRINTF 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define one of the following to 1 for the format of a `double'. + If your format is not among these choices, or you don't know what it is, + then leave all undefined. + IEEE_LITTLE_SWAPPED means little endian, but with the two 4-byte halves + swapped, as used by ARM CPUs in little endian mode. */ +/* #undef HAVE_DOUBLE_IEEE_BIG_ENDIAN */ +#define HAVE_DOUBLE_IEEE_LITTLE_ENDIAN 1 +/* #undef HAVE_DOUBLE_IEEE_LITTLE_SWAPPED */ +/* #undef HAVE_DOUBLE_VAX_D */ +/* #undef HAVE_DOUBLE_VAX_G */ +/* #undef HAVE_DOUBLE_CRAY_CFP */ + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_FLOAT_H 1 + +/* Define to 1 if you have the `getpagesize' function. */ +#define HAVE_GETPAGESIZE 1 + +/* Define to 1 if you have the `getrusage' function. */ +#define HAVE_GETRUSAGE 1 + +/* Define to 1 if you have the `getsysinfo' function. */ +/* #undef HAVE_GETSYSINFO */ + +/* Define to 1 if you have the `gettimeofday' function. */ +#define HAVE_GETTIMEOFDAY 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((visibility)) + and __attribute__ ((alias)) */ +/* #undef HAVE_HIDDEN_ALIAS */ + +/* Define one of these to 1 for the host CPU family. + If your CPU is not in any of these families, leave all undefined. + For an AMD64 chip, define "x86" in ABI=32, but not in ABI=64. 
*/ +/* #undef HAVE_HOST_CPU_FAMILY_alpha */ +/* #undef HAVE_HOST_CPU_FAMILY_m68k */ +/* #undef HAVE_HOST_CPU_FAMILY_power */ +/* #undef HAVE_HOST_CPU_FAMILY_powerpc */ +/* #undef HAVE_HOST_CPU_FAMILY_x86 */ +#define HAVE_HOST_CPU_FAMILY_x86_64 1 + +/* Define one of the following to 1 for the host CPU, as per the output of + ./config.guess. If your CPU is not listed here, leave all undefined. */ +/* #undef HAVE_HOST_CPU_alphaev67 */ +/* #undef HAVE_HOST_CPU_alphaev68 */ +/* #undef HAVE_HOST_CPU_alphaev7 */ +/* #undef HAVE_HOST_CPU_m68020 */ +/* #undef HAVE_HOST_CPU_m68030 */ +/* #undef HAVE_HOST_CPU_m68040 */ +/* #undef HAVE_HOST_CPU_m68060 */ +/* #undef HAVE_HOST_CPU_m68360 */ +/* #undef HAVE_HOST_CPU_powerpc604 */ +/* #undef HAVE_HOST_CPU_powerpc604e */ +/* #undef HAVE_HOST_CPU_powerpc750 */ +/* #undef HAVE_HOST_CPU_powerpc7400 */ +/* #undef HAVE_HOST_CPU_supersparc */ +/* #undef HAVE_HOST_CPU_i386 */ +/* #undef HAVE_HOST_CPU_i586 */ +/* #undef HAVE_HOST_CPU_i686 */ +/* #undef HAVE_HOST_CPU_pentium */ +/* #undef HAVE_HOST_CPU_pentiummmx */ +/* #undef HAVE_HOST_CPU_pentiumpro */ +/* #undef HAVE_HOST_CPU_pentium2 */ +/* #undef HAVE_HOST_CPU_pentium3 */ +/* #undef HAVE_HOST_CPU_pentium4 */ +/* #undef HAVE_HOST_CPU_core2 */ +/* #undef HAVE_HOST_CPU_nehalem */ +/* #undef HAVE_HOST_CPU_westmere */ +/* #undef HAVE_HOST_CPU_sandybridge */ +/* #undef HAVE_HOST_CPU_ivybridge */ +/* #undef HAVE_HOST_CPU_haswell */ +/* #undef HAVE_HOST_CPU_broadwell */ +/* #undef HAVE_HOST_CPU_skylake */ +/* #undef HAVE_HOST_CPU_silvermont */ +/* #undef HAVE_HOST_CPU_goldmont */ +/* #undef HAVE_HOST_CPU_tremont */ +/* #undef HAVE_HOST_CPU_k8 */ +/* #undef HAVE_HOST_CPU_k10 */ +/* #undef HAVE_HOST_CPU_bulldozer */ +/* #undef HAVE_HOST_CPU_piledriver */ +/* #undef HAVE_HOST_CPU_steamroller */ +/* #undef HAVE_HOST_CPU_excavator */ +/* #undef HAVE_HOST_CPU_zen */ +/* #undef HAVE_HOST_CPU_bobcat */ +/* #undef HAVE_HOST_CPU_jaguar */ +/* #undef HAVE_HOST_CPU_s390_z900 */ +/* #undef 
HAVE_HOST_CPU_s390_z990 */ +/* #undef HAVE_HOST_CPU_s390_z9 */ +/* #undef HAVE_HOST_CPU_s390_z10 */ +/* #undef HAVE_HOST_CPU_s390_z196 */ +/* #undef HAVE_HOST_CPU_s390_z13 */ +/* #undef HAVE_HOST_CPU_s390_z14 */ +/* #undef HAVE_HOST_CPU_s390_z15 */ + +/* Define to 1 iff we have a s390 with 64-bit registers. */ +/* #undef HAVE_HOST_CPU_s390_zarch */ + +/* Define to 1 if the system has the type `intmax_t'. */ +#define HAVE_INTMAX_T 1 + +/* Define to 1 if the system has the type `intptr_t'. */ +#define HAVE_INTPTR_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_INVENT_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LANGINFO_H 1 + +/* Define one of these to 1 for the endianness of `mp_limb_t'. + If the endianness is not a simple big or little, or you don't know what + it is, then leave both undefined. */ +/* #undef HAVE_LIMB_BIG_ENDIAN */ +#define HAVE_LIMB_LITTLE_ENDIAN 1 + +/* Define to 1 if you have the `localeconv' function. */ +#define HAVE_LOCALECONV 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LOCALE_H 1 + +/* Define to 1 if the system has the type `long double'. */ +#define HAVE_LONG_DOUBLE 1 + +/* Define to 1 if the system has the type `long long'. */ +#define HAVE_LONG_LONG 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MACHINE_HAL_SYSINFO_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `memset' function. */ +#define HAVE_MEMSET 1 + +/* Define to 1 if you have the `mmap' function. */ +#define HAVE_MMAP 1 + +/* Define to 1 if you have the `mprotect' function. */ +#define HAVE_MPROTECT 1 + +/* Define to 1 each of the following for which a native (ie. CPU specific) + implementation of the corresponding routine exists. 
*/ +#define HAVE_NATIVE_mpn_add_n 1 +/* #undef HAVE_NATIVE_mpn_add_n_sub_n */ +#define HAVE_NATIVE_mpn_add_nc 1 +/* #undef HAVE_NATIVE_mpn_addaddmul_1msb0 */ +#define HAVE_NATIVE_mpn_addlsh1_n 1 +#define HAVE_NATIVE_mpn_addlsh2_n 1 +#define HAVE_NATIVE_mpn_addlsh_n 1 +#define HAVE_NATIVE_mpn_addlsh1_nc 1 +#define HAVE_NATIVE_mpn_addlsh2_nc 1 +/* #undef HAVE_NATIVE_mpn_addlsh_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addmul_1c */ +#define HAVE_NATIVE_mpn_addmul_2 1 +/* #undef HAVE_NATIVE_mpn_addmul_3 */ +/* #undef HAVE_NATIVE_mpn_addmul_4 */ +/* #undef HAVE_NATIVE_mpn_addmul_5 */ +/* #undef HAVE_NATIVE_mpn_addmul_6 */ +/* #undef HAVE_NATIVE_mpn_addmul_7 */ +/* #undef HAVE_NATIVE_mpn_addmul_8 */ +/* #undef HAVE_NATIVE_mpn_addmul_2s */ +#define HAVE_NATIVE_mpn_and_n 1 +#define HAVE_NATIVE_mpn_andn_n 1 +#define HAVE_NATIVE_mpn_bdiv_dbm1c 1 +#define HAVE_NATIVE_mpn_bdiv_q_1 1 +#define HAVE_NATIVE_mpn_pi1_bdiv_q_1 1 +#define HAVE_NATIVE_mpn_cnd_add_n 1 +#define HAVE_NATIVE_mpn_cnd_sub_n 1 +#define HAVE_NATIVE_mpn_com 1 +#define HAVE_NATIVE_mpn_copyd 1 +#define HAVE_NATIVE_mpn_copyi 1 +#define HAVE_NATIVE_mpn_div_qr_1n_pi1 1 +/* #undef HAVE_NATIVE_mpn_div_qr_2 */ +#define HAVE_NATIVE_mpn_divexact_1 1 +/* #undef HAVE_NATIVE_mpn_divexact_by3c */ +#define HAVE_NATIVE_mpn_divrem_1 1 +/* #undef HAVE_NATIVE_mpn_divrem_1c */ +#define HAVE_NATIVE_mpn_divrem_2 1 +/* #undef HAVE_NATIVE_mpn_gcd_1 */ +#define HAVE_NATIVE_mpn_gcd_11 1 +#define HAVE_NATIVE_mpn_gcd_22 1 
+#define HAVE_NATIVE_mpn_hamdist 1 +#define HAVE_NATIVE_mpn_invert_limb 1 +#define HAVE_NATIVE_mpn_ior_n 1 +#define HAVE_NATIVE_mpn_iorn_n 1 +#define HAVE_NATIVE_mpn_lshift 1 +#define HAVE_NATIVE_mpn_lshiftc 1 +/* #undef HAVE_NATIVE_mpn_lshsub_n */ +/* #undef HAVE_NATIVE_mpn_mod_1 */ +#define HAVE_NATIVE_mpn_mod_1_1p 1 +/* #undef HAVE_NATIVE_mpn_mod_1c */ +#define HAVE_NATIVE_mpn_mod_1s_2p 1 +#define HAVE_NATIVE_mpn_mod_1s_4p 1 +#define HAVE_NATIVE_mpn_mod_34lsub1 1 +#define HAVE_NATIVE_mpn_modexact_1_odd 1 +#define HAVE_NATIVE_mpn_modexact_1c_odd 1 +#define HAVE_NATIVE_mpn_mul_1 1 +/* #undef HAVE_NATIVE_mpn_mul_1c */ +#define HAVE_NATIVE_mpn_mul_2 1 +/* #undef HAVE_NATIVE_mpn_mul_3 */ +/* #undef HAVE_NATIVE_mpn_mul_4 */ +/* #undef HAVE_NATIVE_mpn_mul_5 */ +/* #undef HAVE_NATIVE_mpn_mul_6 */ +#define HAVE_NATIVE_mpn_mul_basecase 1 +#define HAVE_NATIVE_mpn_mullo_basecase 1 +#define HAVE_NATIVE_mpn_nand_n 1 +#define HAVE_NATIVE_mpn_nior_n 1 +#define HAVE_NATIVE_mpn_popcount 1 +#define HAVE_NATIVE_mpn_preinv_divrem_1 1 +/* #undef HAVE_NATIVE_mpn_preinv_mod_1 */ +#define HAVE_NATIVE_mpn_redc_1 1 +/* #undef HAVE_NATIVE_mpn_redc_2 */ +#define HAVE_NATIVE_mpn_rsblsh1_n 1 +#define HAVE_NATIVE_mpn_rsblsh2_n 1 +#define HAVE_NATIVE_mpn_rsblsh_n 1 +#define HAVE_NATIVE_mpn_rsblsh1_nc 1 +/* #undef HAVE_NATIVE_mpn_rsblsh2_nc */ +/* #undef HAVE_NATIVE_mpn_rsblsh_nc */ +#define HAVE_NATIVE_mpn_rsh1add_n 1 +#define HAVE_NATIVE_mpn_rsh1add_nc 1 +#define HAVE_NATIVE_mpn_rsh1sub_n 1 +#define HAVE_NATIVE_mpn_rsh1sub_nc 1 +#define HAVE_NATIVE_mpn_rshift 1 +#define HAVE_NATIVE_mpn_sbpi1_bdiv_r 1 +#define HAVE_NATIVE_mpn_sqr_basecase 1 +/* #undef HAVE_NATIVE_mpn_sqr_diagonal */ +#define HAVE_NATIVE_mpn_sqr_diag_addlsh1 1 +#define HAVE_NATIVE_mpn_sub_n 1 +#define HAVE_NATIVE_mpn_sub_nc 1 +#define HAVE_NATIVE_mpn_sublsh1_n 1 +#define HAVE_NATIVE_mpn_sublsh2_n 1 +/* #undef HAVE_NATIVE_mpn_sublsh_n */ +/* #undef HAVE_NATIVE_mpn_sublsh1_nc */ +/* #undef HAVE_NATIVE_mpn_sublsh2_nc */ +/* #undef 
HAVE_NATIVE_mpn_sublsh_nc */ +/* #undef HAVE_NATIVE_mpn_sublsh1_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh2_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh1_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh2_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_submul_1c */ +/* #undef HAVE_NATIVE_mpn_tabselect */ +/* #undef HAVE_NATIVE_mpn_udiv_qrnnd */ +/* #undef HAVE_NATIVE_mpn_udiv_qrnnd_r */ +/* #undef HAVE_NATIVE_mpn_umul_ppmm */ +/* #undef HAVE_NATIVE_mpn_umul_ppmm_r */ +#define HAVE_NATIVE_mpn_xor_n 1 +#define HAVE_NATIVE_mpn_xnor_n 1 + +/* Define to 1 if you have the `nl_langinfo' function. */ +#define HAVE_NL_LANGINFO 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_NL_TYPES_H 1 + +/* Define to 1 if you have the `obstack_vprintf' function. */ +/* #undef HAVE_OBSTACK_VPRINTF */ + +/* Define to 1 if you have the `popen' function. */ +#define HAVE_POPEN 1 + +/* Define to 1 if you have the `processor_info' function. */ +#define HAVE_PROCESSOR_INFO 1 + +/* Define to 1 if `struct pst_processor' exists and contains + `psp_iticksperclktick'. */ +/* #undef HAVE_PSP_ITICKSPERCLKTICK */ + +/* Define to 1 if you have the `pstat_getprocessor' function. */ +/* #undef HAVE_PSTAT_GETPROCESSOR */ + +/* Define to 1 if the system has the type `ptrdiff_t'. */ +#define HAVE_PTRDIFF_T 1 + +/* Define to 1 if the system has the type `quad_t'. */ +#define HAVE_QUAD_T 1 + +/* Define to 1 if you have the `raise' function. */ +#define HAVE_RAISE 1 + +/* Define to 1 if you have the `read_real_time' function. */ +/* #undef HAVE_READ_REAL_TIME */ + +/* Define to 1 if you have the `sigaction' function. */ +#define HAVE_SIGACTION 1 + +/* Define to 1 if you have the `sigaltstack' function. */ +#define HAVE_SIGALTSTACK 1 + +/* Define to 1 if you have the `sigstack' function. 
*/ +/* #undef HAVE_SIGSTACK */ + +/* Tune directory speed_cyclecounter, undef=none, 1=32bits, 2=64bits) */ +#define HAVE_SPEED_CYCLECOUNTER 2 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SSTREAM */ + +/* Define to 1 if the system has the type `stack_t'. */ +#define HAVE_STACK_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if the system has the type `std::locale'. */ +/* #undef HAVE_STD__LOCALE */ + +/* Define to 1 if you have the `strchr' function. */ +#define HAVE_STRCHR 1 + +/* Define to 1 if you have the `strerror' function. */ +#define HAVE_STRERROR 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strnlen' function. */ +#define HAVE_STRNLEN 1 + +/* Define to 1 if you have the `strtol' function. */ +#define HAVE_STRTOL 1 + +/* Define to 1 if you have the `strtoul' function. */ +#define HAVE_STRTOUL 1 + +/* Define to 1 if you have the `sysconf' function. */ +#define HAVE_SYSCONF 1 + +/* Define to 1 if you have the `sysctl' function. */ +#define HAVE_SYSCTL 1 + +/* Define to 1 if you have the `sysctlbyname' function. */ +#define HAVE_SYSCTLBYNAME 1 + +/* Define to 1 if you have the `syssgi' function. */ +/* #undef HAVE_SYSSGI */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_ATTRIBUTES_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_IOGRAPH_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_MMAN_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PROCESSOR_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PSTAT_H */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SYS_RESOURCE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSCTL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSINFO_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSSGI_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSTEMCFG_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIMES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the `times' function. */ +#define HAVE_TIMES 1 + +/* Define to 1 if the system has the type `uint_least32_t'. */ +#define HAVE_UINT_LEAST32_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to 1 if you have the `vsnprintf' function and it works properly. */ +#define HAVE_VSNPRINTF 1 + +/* Define to 1 for Windos/64 */ +/* #undef HOST_DOS64 */ + +/* Assembler local label prefix */ +#define LSYM_PREFIX "L" + +/* Define to the sub-directory where libtool stores uninstalled libraries. */ +#define LT_OBJDIR ".libs/" + +/* Define to 1 to disable the use of inline assembly */ +/* #undef NO_ASM */ + +/* Name of package */ +#define PACKAGE "gmp" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "gmp-bugs@gmplib.org (see https://gmplib.org/manual/Reporting-Bugs.html)" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "GNU MP" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "GNU MP 6.3.0" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "gmp" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "http://www.gnu.org/software/gmp/" + +/* Define to the version of this package. 
*/ +#define PACKAGE_VERSION "6.3.0" + +/* Define as the return type of signal handlers (`int' or `void'). */ +#define RETSIGTYPE void + +/* The size of `mp_limb_t', as computed by sizeof. */ +#define SIZEOF_MP_LIMB_T 8 + +/* The size of `unsigned', as computed by sizeof. */ +#define SIZEOF_UNSIGNED 4 + +/* The size of `unsigned long', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_LONG 8 + +/* The size of `unsigned short', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_SHORT 2 + +/* The size of `void *', as computed by sizeof. */ +#define SIZEOF_VOID_P 8 + +/* Define to 1 if sscanf requires writable inputs */ +/* #undef SSCANF_WRITABLE_INPUT */ + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define to 1 if you can safely include both and . */ +#define TIME_WITH_SYS_TIME 1 + +/* Maximum size the tune program can test for SQR_TOOM2_THRESHOLD */ +/* #undef TUNE_SQR_TOOM2_MAX */ + +/* Version number of package */ +#define VERSION "6.3.0" + +/* Define to 1 to enable ASSERT checking, per --enable-assert */ +/* #undef WANT_ASSERT */ + +/* Define to 1 to enable GMP_CPU_TYPE faking cpuid, per --enable-fake-cpuid */ +/* #undef WANT_FAKE_CPUID */ + +/* Define to 1 when building a fat binary. */ +/* #undef WANT_FAT_BINARY */ + +/* Define to 1 to enable FFTs for multiplication, per --enable-fft */ +#define WANT_FFT 1 + +/* Define to 1 to enable old mpn_mul_fft_full for multiplication, per + --enable-old-fft-full */ +/* #undef WANT_OLD_FFT_FULL */ + +/* Define to 1 if --enable-profiling=gprof */ +/* #undef WANT_PROFILING_GPROF */ + +/* Define to 1 if --enable-profiling=instrument */ +/* #undef WANT_PROFILING_INSTRUMENT */ + +/* Define to 1 if --enable-profiling=prof */ +/* #undef WANT_PROFILING_PROF */ + +/* Define one of these to 1 for the desired temporary memory allocation + method, per --enable-alloca. 
*/ +#define WANT_TMP_ALLOCA 1 +/* #undef WANT_TMP_REENTRANT */ +/* #undef WANT_TMP_NOTREENTRANT */ +/* #undef WANT_TMP_DEBUG */ + +/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most + significant byte first (like Motorola and SPARC, unlike Intel). */ +#if defined AC_APPLE_UNIVERSAL_BUILD +# if defined __BIG_ENDIAN__ +# define WORDS_BIGENDIAN 1 +# endif +#else +# ifndef WORDS_BIGENDIAN +/* # undef WORDS_BIGENDIAN */ +# endif +#endif + +/* Define to 1 if the assembler understands the mulx instruction */ +#define X86_ASM_MULX 1 + +/* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a + `char[]'. */ +#define YYTEXT_POINTER 1 + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +/* #undef inline */ +#endif + +/* Define to the equivalent of the C99 'restrict' keyword, or to + nothing if this is not supported. Do not define if restrict is + supported directly. */ +#define restrict __restrict +/* Work around a bug in Sun C++: it does not support _Restrict or + __restrict__, even though the corresponding Sun C compiler ends up with + "#define restrict _Restrict" or "#define restrict __restrict__" in the + previous line. Perhaps some future version of Sun C++ will work with + restrict; if so, hopefully it defines __RESTRICT like Sun C does. */ +#if defined __SUNPRO_CC && !defined __RESTRICT +# define _Restrict +# define __restrict__ +#endif + +/* Define to empty if the keyword `volatile' does not work. Warning: valid + code using `volatile' can become incorrect without. Disable with care. 
*/ +/* #undef volatile */ diff --git a/ext/gmp/gen/x86_64-macos/mpn/addlsh_n.s b/ext/gmp/gen/x86_64-macos/mpn/addlsh_n.s index 09035432cf..f71088eb48 100644 --- a/ext/gmp/gen/x86_64-macos/mpn/addlsh_n.s +++ b/ext/gmp/gen/x86_64-macos/mpn/addlsh_n.s @@ -89,6 +89,7 @@ + diff --git a/ext/gmp/gen/x86_64-macos/mpn/addmul_1.s b/ext/gmp/gen/x86_64-macos/mpn/addmul_1.s index b884829fe2..0435e8ae76 100644 --- a/ext/gmp/gen/x86_64-macos/mpn/addmul_1.s +++ b/ext/gmp/gen/x86_64-macos/mpn/addmul_1.s @@ -69,6 +69,8 @@ + + diff --git a/ext/gmp/gen/x86_64-macos/mpn/mul_basecase.s b/ext/gmp/gen/x86_64-macos/mpn/mul_basecase.s index 0e74e66cde..9d24fda96c 100644 --- a/ext/gmp/gen/x86_64-macos/mpn/mul_basecase.s +++ b/ext/gmp/gen/x86_64-macos/mpn/mul_basecase.s @@ -311,8 +311,7 @@ Ldone: ret -Lf2: - .byte 0xc4,98,171,0xf6,94,248 +Lf2: .byte 0xc4,98,171,0xf6,94,248 lea 8(%rdi,%rbx,8), %rdi .byte 0xc4,98,155,0xf6,14 diff --git a/ext/gmp/gen/x86_64-macos/mpn/rsblsh_n.s b/ext/gmp/gen/x86_64-macos/mpn/rsblsh_n.s index 0117c0d9c2..329c600b48 100644 --- a/ext/gmp/gen/x86_64-macos/mpn/rsblsh_n.s +++ b/ext/gmp/gen/x86_64-macos/mpn/rsblsh_n.s @@ -90,6 +90,7 @@ + diff --git a/ext/gmp/gen/x86_64-macos/mpn/sbpi1_bdiv_r.s b/ext/gmp/gen/x86_64-macos/mpn/sbpi1_bdiv_r.s new file mode 100644 index 0000000000..dcf3376688 --- /dev/null +++ b/ext/gmp/gen/x86_64-macos/mpn/sbpi1_bdiv_r.s @@ -0,0 +1,681 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_sbpi1_bdiv_r + + +___gmpn_sbpi1_bdiv_r: + + + + + lea Latab(%rip), %r10 + + cmp $8, %rcx + jbe Lsma + + + +Lgen: push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + mov %rdx, %r14 + xor %r13, %r13 + + sub %rcx, %rsi + + lea -8(,%rcx,8), %rbx + neg %rbx + mov %rcx, %rbp + mov %ecx, %eax + shr $3, %rbp + and $7, %eax + + movslq (%r10,%rax,4), %rax + 
lea (%rax,%r10), %rax + + mov (%rdi), %rdx + imul %r8, %rdx + jmp Louter + +Lf0: .byte 0xc4,66,171,0xf6,30 + lea -1(%rcx), %rcx + .byte 0xc4,66,155,0xf6,78,8 + lea -8(%r14), %r14 + .byte 0x66,77,0x0f,0x38,0xf6,227 + .byte 0xf3,76,0x0f,0x38,0xf6,23 + lea -8(%rdi), %rdi + jmp Lb0x + +Lf3: .byte 0xc4,66,155,0xf6,14 + .byte 0xc4,66,171,0xf6,94,8 + .byte 0xf3,76,0x0f,0x38,0xf6,39 + lea -48(%rdi), %rdi + lea 16(%r14), %r14 + jmp Lb3x + +Lf4: .byte 0xc4,66,171,0xf6,30 + .byte 0xc4,66,155,0xf6,78,8 + lea 24(%r14), %r14 + .byte 0xf3,76,0x0f,0x38,0xf6,23 + lea -40(%rdi), %rdi + .byte 0x66,77,0x0f,0x38,0xf6,227 + jmp Lb4x + +Lf5: .byte 0xc4,66,155,0xf6,14 + .byte 0xc4,66,171,0xf6,94,8 + lea 32(%r14), %r14 + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xf3,76,0x0f,0x38,0xf6,39 + lea -32(%rdi), %rdi + jmp Lb5x + +Lf6: .byte 0xc4,66,171,0xf6,30 + .byte 0xc4,66,155,0xf6,78,8 + lea 40(%r14), %r14 + .byte 0xf3,76,0x0f,0x38,0xf6,23 + lea -24(%rdi), %rdi + .byte 0x66,77,0x0f,0x38,0xf6,227 + jmp Lb6x + +Lf7: .byte 0xc4,66,155,0xf6,14 + .byte 0xc4,66,171,0xf6,94,8 + lea 48(%r14), %r14 + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xf3,76,0x0f,0x38,0xf6,39 + lea -16(%rdi), %rdi + jmp Lb7x + +Lf1: .byte 0xc4,66,155,0xf6,14 + .byte 0xc4,66,171,0xf6,94,8 + .byte 0xf3,76,0x0f,0x38,0xf6,39 + lea -1(%rcx), %rcx + jmp Lb1x + +Lf2: .byte 0xc4,66,171,0xf6,30 + .byte 0xc4,66,155,0xf6,78,8 + lea 8(%r14), %r14 + .byte 0xf3,76,0x0f,0x38,0xf6,23 + lea 8(%rdi), %rdi + .byte 0x66,77,0x0f,0x38,0xf6,227 + jmp Lb2x + +Lend: .byte 0xf3,76,0x0f,0x38,0xf6,39 + .byte 0xf3,76,0x0f,0x38,0xf6,201 + mov %r12, (%rdi) + adc %rcx, %r9 + mov 8(%rdi,%rbx), %rdx + .byte 0xc4,66,235,0xf6,224 + bt $0, %r13d + adc %r9, 8(%rdi) + setc %r13b + dec %rsi + jz Ldone + + lea (%r14,%rbx), %r14 + lea 8(%rdi,%rbx), %rdi +Louter: + mov %rbp, %rcx + test %eax, %eax + jmp *%rax + + .align 4, 0x90 +Ltop: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 + .byte 0x66,77,0x0f,0x38,0xf6,227 + mov %r10, -8(%rdi) + jrcxz Lend +Lb2x: .byte 
0xc4,66,171,0xf6,94,8 + .byte 0xf3,76,0x0f,0x38,0xf6,39 + lea -1(%rcx), %rcx + mov %r12, (%rdi) +Lb1x: .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xc4,66,155,0xf6,78,16 + .byte 0x66,77,0x0f,0x38,0xf6,227 + .byte 0xf3,76,0x0f,0x38,0xf6,87,8 + mov %r10, 8(%rdi) +Lb0x: .byte 0xc4,66,171,0xf6,94,24 + lea 64(%r14), %r14 + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xf3,76,0x0f,0x38,0xf6,103,16 + mov %r12, 16(%rdi) +Lb7x: .byte 0xc4,66,155,0xf6,78,224 + .byte 0xf3,76,0x0f,0x38,0xf6,87,24 + .byte 0x66,77,0x0f,0x38,0xf6,227 + mov %r10, 24(%rdi) +Lb6x: .byte 0xc4,66,171,0xf6,94,232 + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xf3,76,0x0f,0x38,0xf6,103,32 + mov %r12, 32(%rdi) +Lb5x: .byte 0xc4,66,155,0xf6,78,240 + .byte 0xf3,76,0x0f,0x38,0xf6,87,40 + .byte 0x66,77,0x0f,0x38,0xf6,227 + mov %r10, 40(%rdi) +Lb4x: .byte 0xf3,76,0x0f,0x38,0xf6,103,48 + .byte 0xc4,66,171,0xf6,94,248 + mov %r12, 48(%rdi) +Lb3x: lea 64(%rdi), %rdi + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xc4,66,155,0xf6,14 + jmp Ltop + +Ldone:mov %r13, %rax + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + +Lsma: + movslq 28(%r10,%rcx,4), %rax + lea (%rax,%r10), %rax + + jmp *%rax + +L1: mov (%rdx), %r10 + xor %eax, %eax + mov (%rdi), %rdx + dec %rsi + mov %rdx, %r9 +Lo1: .byte 0xc4,66,235,0xf6,216 + lea 8(%rdi), %rdi + .byte 0xc4,194,243,0xf6,210 + add %r9, %rcx + adc %rax, %rdx + add (%rdi), %rdx + setc %al + mov %rdx, %r9 + dec %rsi + jnz Lo1 + mov %r9, (%rdi) + + + ret + + +L2: push %r12 + push %r14 + + mov %rdx, %r14 + sub %rcx, %rsi + mov (%rdi), %rdx + imul %r8, %rdx + + + + push %rbx + push %r13 + xor %r13d, %r13d + mov (%rdi), %rax + mov 8(%rdi), %rbx +Lo2: xor %ecx, %ecx + .byte 0xc4,66,171,0xf6,30 + .byte 0xc4,66,235,0xf6,78,8 + .byte 0xf3,76,0x0f,0x38,0xf6,208 + .byte 0x66,73,0x0f,0x38,0xf6,211 + .byte 0xf3,72,0x0f,0x38,0xf6,211 + .byte 0xf3,76,0x0f,0x38,0xf6,201 + mov %rdx, %rax + adc %rcx, %r9 + imul %r8, %rdx + bt $0, %r13d + adc 16(%rdi), %r9 + mov %r9, %rbx + setc %r13b + lea 8(%rdi), 
%rdi + dec %rsi + jnz Lo2 + + mov %rax, (%rdi) + mov %rbx, 8(%rdi) + mov %r13, %rax + pop %r13 + pop %rbx + + + + pop %r14 + pop %r12 + + ret + + +L3: push %rbx + push %r12 + push %r13 + push %r14 + + mov %rdx, %r14 + xor %r13, %r13 + sub %rcx, %rsi + mov (%rdi), %rax + mov 8(%rdi), %rbx + mov %rax, %rdx + imul %r8, %rdx +Lo3: xor %ecx, %ecx + .byte 0xc4,66,155,0xf6,14 + .byte 0xf3,76,0x0f,0x38,0xf6,224 + .byte 0xc4,66,251,0xf6,94,8 + .byte 0x66,73,0x0f,0x38,0xf6,193 + .byte 0xf3,72,0x0f,0x38,0xf6,195 + .byte 0xc4,66,227,0xf6,78,16 + mov %r8, %rdx + .byte 0xc4,98,235,0xf6,224 + .byte 0x66,73,0x0f,0x38,0xf6,219 + .byte 0xf3,72,0x0f,0x38,0xf6,95,16 + .byte 0xf3,76,0x0f,0x38,0xf6,201 + adc $0, %r9 + bt $0, %r13d + adc %r9, 24(%rdi) + setc %r13b + lea 8(%rdi), %rdi + dec %rsi + jnz Lo3 + jmp Lesma + + + +L4: push %rbx + push %r12 + push %r13 + push %r14 + + mov %rdx, %r14 + xor %r13, %r13 + sub %rcx, %rsi + mov (%rdi), %rax + mov 8(%rdi), %rbx + mov %rax, %rdx + imul %r8, %rdx +Lo4: xor %ecx, %ecx + .byte 0xc4,66,171,0xf6,30 + .byte 0xf3,76,0x0f,0x38,0xf6,208 + .byte 0xc4,66,251,0xf6,78,8 + .byte 0x66,73,0x0f,0x38,0xf6,195 + .byte 0xf3,72,0x0f,0x38,0xf6,195 + .byte 0xc4,66,227,0xf6,94,16 + .byte 0x66,73,0x0f,0x38,0xf6,217 + .byte 0xc4,66,155,0xf6,78,24 + mov %r8, %rdx + .byte 0xc4,98,235,0xf6,208 + .byte 0xf3,72,0x0f,0x38,0xf6,95,16 + .byte 0x66,77,0x0f,0x38,0xf6,227 + .byte 0xf3,76,0x0f,0x38,0xf6,103,24 + .byte 0xf3,76,0x0f,0x38,0xf6,201 + mov %r12, 24(%rdi) + adc %rcx, %r9 + bt $0, %r13d + adc %r9, 32(%rdi) + setc %r13b + lea 8(%rdi), %rdi + dec %rsi + jnz Lo4 + jmp Lesma + + + +L5: push %rbx + push %r12 + push %r13 + push %r14 + + mov %rdx, %r14 + xor %r13, %r13 + sub %rcx, %rsi + mov (%rdi), %rax + mov 8(%rdi), %rbx + mov %rax, %rdx + imul %r8, %rdx +Lo5: xor %ecx, %ecx + .byte 0xc4,66,155,0xf6,14 + .byte 0xf3,76,0x0f,0x38,0xf6,224 + .byte 0xc4,66,251,0xf6,94,8 + .byte 0x66,73,0x0f,0x38,0xf6,193 + .byte 0xf3,72,0x0f,0x38,0xf6,195 + .byte 0xc4,66,227,0xf6,78,16 + 
.byte 0x66,73,0x0f,0x38,0xf6,219 + .byte 0xf3,72,0x0f,0x38,0xf6,95,16 + .byte 0xc4,66,171,0xf6,94,24 + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xc4,66,155,0xf6,78,32 + .byte 0xf3,76,0x0f,0x38,0xf6,87,24 + .byte 0x66,77,0x0f,0x38,0xf6,227 + mov %r8, %rdx + .byte 0xc4,98,235,0xf6,216 + mov %r10, 24(%rdi) + .byte 0xf3,76,0x0f,0x38,0xf6,103,32 + .byte 0xf3,76,0x0f,0x38,0xf6,201 + mov %r12, 32(%rdi) + adc %rcx, %r9 + bt $0, %r13d + adc %r9, 40(%rdi) + setc %r13b + lea 8(%rdi), %rdi + dec %rsi + jnz Lo5 + jmp Lesma + + + +L6: push %rbx + push %r12 + push %r13 + push %r14 + + mov %rdx, %r14 + xor %r13, %r13 + sub %rcx, %rsi + mov (%rdi), %rax + mov 8(%rdi), %rbx + mov %rax, %rdx + imul %r8, %rdx +Lo6: xor %ecx, %ecx + .byte 0xc4,66,171,0xf6,30 + .byte 0xf3,76,0x0f,0x38,0xf6,208 + .byte 0xc4,66,251,0xf6,78,8 + .byte 0x66,73,0x0f,0x38,0xf6,195 + .byte 0xf3,72,0x0f,0x38,0xf6,195 + .byte 0xc4,66,227,0xf6,94,16 + .byte 0x66,73,0x0f,0x38,0xf6,217 + .byte 0xc4,66,155,0xf6,78,24 + .byte 0xf3,72,0x0f,0x38,0xf6,95,16 + .byte 0x66,77,0x0f,0x38,0xf6,227 + .byte 0xf3,76,0x0f,0x38,0xf6,103,24 + .byte 0xc4,66,171,0xf6,94,32 + mov %r12, 24(%rdi) + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xc4,66,155,0xf6,78,40 + .byte 0xf3,76,0x0f,0x38,0xf6,87,32 + .byte 0x66,77,0x0f,0x38,0xf6,227 + mov %r8, %rdx + .byte 0xc4,98,235,0xf6,216 + mov %r10, 32(%rdi) + .byte 0xf3,76,0x0f,0x38,0xf6,103,40 + .byte 0xf3,76,0x0f,0x38,0xf6,201 + mov %r12, 40(%rdi) + adc %rcx, %r9 + bt $0, %r13d + adc %r9, 48(%rdi) + setc %r13b + lea 8(%rdi), %rdi + dec %rsi + jnz Lo6 + jmp Lesma + + + +L7: push %rbx + push %r12 + push %r13 + push %r14 + + mov %rdx, %r14 + xor %r13, %r13 + sub %rcx, %rsi + mov (%rdi), %rax + mov 8(%rdi), %rbx + mov %rax, %rdx + imul %r8, %rdx +Lo7: xor %ecx, %ecx + .byte 0xc4,66,155,0xf6,14 + .byte 0xf3,76,0x0f,0x38,0xf6,224 + .byte 0xc4,66,251,0xf6,94,8 + .byte 0x66,73,0x0f,0x38,0xf6,193 + .byte 0xf3,72,0x0f,0x38,0xf6,195 + .byte 0xc4,66,227,0xf6,78,16 + .byte 0x66,73,0x0f,0x38,0xf6,219 + .byte 
0xc4,66,171,0xf6,94,24 + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xf3,72,0x0f,0x38,0xf6,95,16 + .byte 0xc4,66,155,0xf6,78,32 + .byte 0xf3,76,0x0f,0x38,0xf6,87,24 + .byte 0x66,77,0x0f,0x38,0xf6,227 + mov %r10, 24(%rdi) + .byte 0xf3,76,0x0f,0x38,0xf6,103,32 + .byte 0xc4,66,171,0xf6,94,40 + mov %r12, 32(%rdi) + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xc4,66,155,0xf6,78,48 + .byte 0xf3,76,0x0f,0x38,0xf6,87,40 + .byte 0x66,77,0x0f,0x38,0xf6,227 + mov %r10, 40(%rdi) + mov %rax, %rdx + .byte 0xc4,66,235,0xf6,208 + .byte 0xf3,76,0x0f,0x38,0xf6,103,48 + .byte 0xf3,76,0x0f,0x38,0xf6,201 + mov %r12, 48(%rdi) + adc %rcx, %r9 + bt $0, %r13d + adc %r9, 56(%rdi) + setc %r13b + lea 8(%rdi), %rdi + dec %rsi + jnz Lo7 + jmp Lesma + + + +L8: push %rbx + push %r12 + push %r13 + push %r14 + + mov %rdx, %r14 + xor %r13, %r13 + sub %rcx, %rsi + mov (%rdi), %rax + mov 8(%rdi), %rbx + mov %rax, %rdx + imul %r8, %rdx +Lo8: xor %ecx, %ecx + .byte 0xc4,66,171,0xf6,30 + .byte 0xf3,76,0x0f,0x38,0xf6,208 + .byte 0xc4,66,251,0xf6,78,8 + .byte 0x66,73,0x0f,0x38,0xf6,195 + .byte 0xf3,72,0x0f,0x38,0xf6,195 + .byte 0xc4,66,227,0xf6,94,16 + .byte 0x66,73,0x0f,0x38,0xf6,217 + .byte 0xc4,66,155,0xf6,78,24 + .byte 0xf3,72,0x0f,0x38,0xf6,95,16 + .byte 0x66,77,0x0f,0x38,0xf6,227 + .byte 0xc4,66,171,0xf6,94,32 + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xf3,76,0x0f,0x38,0xf6,103,24 + mov %r12, 24(%rdi) + .byte 0xc4,66,155,0xf6,78,40 + .byte 0xf3,76,0x0f,0x38,0xf6,87,32 + .byte 0x66,77,0x0f,0x38,0xf6,227 + mov %r10, 32(%rdi) + .byte 0xf3,76,0x0f,0x38,0xf6,103,40 + .byte 0xc4,66,171,0xf6,94,48 + mov %r12, 40(%rdi) + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xc4,66,155,0xf6,78,56 + .byte 0xf3,76,0x0f,0x38,0xf6,87,48 + .byte 0x66,77,0x0f,0x38,0xf6,227 + mov %r8, %rdx + .byte 0xc4,98,235,0xf6,216 + mov %r10, 48(%rdi) + .byte 0xf3,76,0x0f,0x38,0xf6,103,56 + .byte 0xf3,76,0x0f,0x38,0xf6,201 + mov %r12, 56(%rdi) + adc %rcx, %r9 + bt $0, %r13d + adc %r9, 64(%rdi) + setc %r13b + lea 8(%rdi), %rdi + dec %rsi + jnz 
Lo8 + jmp Lesma + + +Lesma:mov %rax, (%rdi) + mov %rbx, 8(%rdi) + mov %r13, %rax + pop %r14 + pop %r13 + pop %r12 + pop %rbx + + ret + + + .text + .align 3, 0x90 +Latab:.set Lf0_tmp, Lf0-Latab + .long Lf0_tmp + + .set Lf1_tmp, Lf1-Latab + .long Lf1_tmp + + .set Lf2_tmp, Lf2-Latab + .long Lf2_tmp + + .set Lf3_tmp, Lf3-Latab + .long Lf3_tmp + + .set Lf4_tmp, Lf4-Latab + .long Lf4_tmp + + .set Lf5_tmp, Lf5-Latab + .long Lf5_tmp + + .set Lf6_tmp, Lf6-Latab + .long Lf6_tmp + + .set Lf7_tmp, Lf7-Latab + .long Lf7_tmp + + .set L1_tmp, L1-Latab + .long L1_tmp + + .set L2_tmp, L2-Latab + .long L2_tmp + + .set L3_tmp, L3-Latab + .long L3_tmp + + .set L4_tmp, L4-Latab + .long L4_tmp + + .set L5_tmp, L5-Latab + .long L5_tmp + + .set L6_tmp, L6-Latab + .long L6_tmp + + .set L7_tmp, L7-Latab + .long L7_tmp + + .set L8_tmp, L8-Latab + .long L8_tmp + + .text + diff --git a/ext/gmp/gen/x86_64-macos/sieve_table.h b/ext/gmp/gen/x86_64-macos/sieve_table.h new file mode 100644 index 0000000000..ee9ac14360 --- /dev/null +++ b/ext/gmp/gen/x86_64-macos/sieve_table.h @@ -0,0 +1,46 @@ +/* This file generated by gen-sieve.c - DO NOT EDIT. 
*/ + +#if GMP_LIMB_BITS != 64 +Error, error, this data is for 64 bits +#endif + +#define PRIMESIEVE_INIT_TABLE \ + CNST_LIMB (0x3294C9E069128480), /* 5 - 196 (42 primes) */ \ + CNST_LIMB (0x95A35E1EC4AB21DC), /* 197 - 388 (32 primes) */ \ + CNST_LIMB (0x4AD7CE99B8693366), /* 389 - 580 (30 primes) */ \ + CNST_LIMB (0x6595B6DA728DC52B), /* 581 - 772 (30 primes) */ \ + CNST_LIMB (0xEA6D9F8787B0CEDE), /* 773 - 964 (26 primes) */ \ + CNST_LIMB (0x3F56A1F4CD3275A9), /* 965 - 1156 (29 primes) */ \ + CNST_LIMB (0xFD3848FB74A76ADB), /* 1157 - 1348 (26 primes) */ \ + CNST_LIMB (0xDBBA0DD1A1EDF6AF), /* 1349 - 1540 (25 primes) */ \ + CNST_LIMB (0xCEC7F17ED22799A5), /* 1541 - 1732 (27 primes) */ \ + CNST_LIMB (0xEAEC17BDBB717D56), /* 1733 - 1924 (24 primes) */ \ + CNST_LIMB (0x3B0EB7B3585AFCF3), /* 1925 - 2116 (26 primes) */ \ + CNST_LIMB (0xE563D8F69FDF6C4F), /* 2117 - 2308 (23 primes) */ \ + CNST_LIMB (0xFE5BA7ABA45E92FC), /* 2309 - 2500 (25 primes) */ \ + CNST_LIMB (0x158DEE6F3BF49B7D), /* 2501 - 2692 (24 primes) */ \ + CNST_LIMB (0xBE5A7BC4EDE6CD1A), /* 2693 - 2884 (26 primes) */ \ + CNST_LIMB (0xD7679B3FCA7BB6AD), /* 2885 - 3076 (22 primes) */ \ + CNST_LIMB (0xC3F66B971FEF37E9), /* 3077 - 3268 (22 primes) */ \ + CNST_LIMB (0x6F7EBCF339C953FD), /* 3269 - 3460 (22 primes) */ \ + CNST_LIMB (0xD5A5ECDCD235DBF0), /* 3461 - 3652 (27 primes) */ \ + CNST_LIMB (0xECFA7B2FD5B65E3B), /* 3653 - 3844 (22 primes) */ \ + CNST_LIMB (0xD28EFDF9C89F67B1), /* 3845 - 4036 (25 primes) */ \ + CNST_LIMB (0xCB7F7C7A3DD3AF4F), /* 4037 - 4228 (21 primes) */ \ + CNST_LIMB (0xEEBED6CDFF6B32CC), /* 4229 - 4420 (22 primes) */ \ + CNST_LIMB (0xD5BD73F85ECFA97C), /* 4421 - 4612 (23 primes) */ \ + CNST_LIMB (0x21FDBE4FBBAD48F7), /* 4613 - 4804 (24 primes) */ \ + CNST_LIMB (0x5E35A3B5EEB7FDE7), /* 4805 - 4996 (21 primes) */ \ + CNST_LIMB (0xD9EBFD53A7DBBCC9), /* 4997 - 5188 (22 primes) */ \ + CNST_LIMB (0xFF9EDEAF2EFE1F76), /* 5189 - 5380 (18 primes) */ +#define PRIMESIEVE_NUMBEROF_TABLE 28 +/* #define 
PRIMESIEVE_PRIMES_IN_TABLE 706 */ +#define PRIMESIEVE_HIGHEST_PRIME 5351 +/* #define PRIMESIEVE_FIRST_UNCHECKED 5381 */ + +#define SIEVE_MASK1 CNST_LIMB(0x3204C1A049120485) +#define SIEVE_MASKT CNST_LIMB(0xA1204892058) +#define SIEVE_2MSK1 CNST_LIMB(0x29048402110840A) +#define SIEVE_2MSK2 CNST_LIMB(0x9402180C40230184) +#define SIEVE_2MSKT CNST_LIMB(0x5021088402120) + diff --git a/ext/natpmp/build.zig.zon b/ext/natpmp/build.zig.zon index 9f1c3f12a9..76d822b995 100644 --- a/ext/natpmp/build.zig.zon +++ b/ext/natpmp/build.zig.zon @@ -3,7 +3,7 @@ .version = "0.0.1", .dependencies = .{ .natpmp = .{ - .url = "http://www.miniupnp.tuxfamily.org/files/libnatpmp-20230423.tar.gz", + .url = "https://debian.mirror.root.lu/debian/pool/main/libn/libnatpmp/libnatpmp_20230423.orig.tar.gz", .hash = "12203f777796f1df1db24c4194bcc6060d2a7bee2eea88527c2336bbf455d4108239", }, }, diff --git a/ext/softblas/build.zig b/ext/softblas/build.zig new file mode 100644 index 0000000000..1fb3b066b5 --- /dev/null +++ b/ext/softblas/build.zig @@ -0,0 +1,80 @@ +const std = @import("std"); + +pub fn build(b: *std.Build) void { + const target = b.standardTargetOptions(.{}); + const optimize = b.standardOptimizeOption(.{}); + + const lib = b.addStaticLibrary(.{ + .name = "softblas", + .target = target, + .optimize = optimize, + }); + + const dep_c = b.dependency("softblas", .{ + .target = target, + .optimize = optimize, + }); + + const softfloat = b.dependency("softfloat", .{ + .target = target, + .optimize = optimize, + }); + + lib.addIncludePath(dep_c.path("include")); + + lib.addCSourceFiles(.{ + .root = dep_c.path(""), + .files = &.{ + "src/softblas_state.c", + "src/blas/level1/sasum.c", + "src/blas/level1/dasum.c", + "src/blas/level1/hasum.c", + "src/blas/level1/qasum.c", + "src/blas/level1/saxpy.c", + "src/blas/level1/daxpy.c", + "src/blas/level1/haxpy.c", + "src/blas/level1/qaxpy.c", + "src/blas/level1/scopy.c", + "src/blas/level1/dcopy.c", + "src/blas/level1/hcopy.c", + 
"src/blas/level1/qcopy.c", + "src/blas/level1/sdot.c", + "src/blas/level1/ddot.c", + "src/blas/level1/hdot.c", + "src/blas/level1/qdot.c", + "src/blas/level1/snrm2.c", + "src/blas/level1/dnrm2.c", + "src/blas/level1/hnrm2.c", + "src/blas/level1/qnrm2.c", + "src/blas/level1/sscal.c", + "src/blas/level1/dscal.c", + "src/blas/level1/hscal.c", + "src/blas/level1/qscal.c", + "src/blas/level1/sswap.c", + "src/blas/level1/dswap.c", + "src/blas/level1/hswap.c", + "src/blas/level1/qswap.c", + "src/blas/level1/isamax.c", + "src/blas/level1/idamax.c", + "src/blas/level1/ihamax.c", + "src/blas/level1/iqamax.c", + "src/blas/level2/sgemv.c", + "src/blas/level2/dgemv.c", + "src/blas/level2/hgemv.c", + "src/blas/level2/qgemv.c", + "src/blas/level3/sgemm.c", + "src/blas/level3/dgemm.c", + "src/blas/level3/hgemm.c", + "src/blas/level3/qgemm.c", + }, + .flags = &.{ + "-fno-sanitize=all", + }, + }); + + lib.installHeader(dep_c.path("include/softblas.h"), "softblas.h"); + + lib.linkLibC(); + lib.linkLibrary(softfloat.artifact("softfloat")); + b.installArtifact(lib); +} diff --git a/ext/softblas/build.zig.zon b/ext/softblas/build.zig.zon new file mode 100644 index 0000000000..431171b53b --- /dev/null +++ b/ext/softblas/build.zig.zon @@ -0,0 +1,16 @@ +.{ + .name = "softblas", + .version = "0.0.1", + .dependencies = .{ + .softfloat = .{ + .path = "../softfloat", + }, + .softblas = .{ + .url = "https://github.com/urbit/SoftBLAS/archive/cbffb33f19ea02f9ffbd184d445123c57929ec53.tar.gz", + .hash = "1220617c11d869ef2316571a430f51f93470e2d714141deb3bdfaa6b578cf151f258", + }, + }, + .paths = .{ + "", + }, +} diff --git a/pkg/c3/motes.h b/pkg/c3/motes.h index 9d34561c09..daf5b90896 100644 --- a/pkg/c3/motes.h +++ b/pkg/c3/motes.h @@ -259,6 +259,7 @@ # define c3__corp c3_s4('c','o','r','p') # define c3__corp c3_s4('c','o','r','p') # define c3__cow c3_s3('c','o','w') +# define c3__cplx c3_s3('c','p','l','x') # define c3__cpu c3_s3('c','p','u') # define c3__crad c3_s4('c','r','a','d') # define 
c3__cram c3_s4('c','r','a','m') @@ -431,6 +432,7 @@ # define c3__fit c3_s3('f','i','t') # define c3__fits c3_s4('f','i','t','s') # define c3__fix c3_s3('f','i','x') +# define c3__fixp c3_s3('f','i','x','p') # define c3__fl c3_s2('f','l') # define c3__flac c3_s4('f','l','a','c') # define c3__flag c3_s4('f','l','a','g') @@ -605,6 +607,7 @@ # define c3__info c3_s4('i','n','f','o') # define c3__init c3_s4('i','n','i','t') # define c3__ins c3_s3('i','n','s') +# define c3__int2 c3_s4('i','n','t','2') # define c3__into c3_s4('i','n','t','o') # define c3__intr c3_s4('i','n','t','r') # define c3__inuk c3_s4('i','n','u','k') @@ -613,6 +616,7 @@ # define c3__is c3_s2('i','s') # define c3__item c3_s4('i','t','e','m') # define c3__ix c3_s2('i','x') +# define c3__i754 c3_s4('i','7','5','4') # define c3__j c3_s1('j') # define c3__jack c3_s4('j','a','c','k') # define c3__jam c3_s3('j','a','m') @@ -981,6 +985,7 @@ # define c3__rasp c3_s4('r','a','s','p') # define c3__raw c3_s3('r','a','w') # define c3__read c3_s4('r','e','a','d') +# define c3__real c3_s4('r','e','a','l') # define c3__reck c3_s4('r','e','c','k') # define c3__reef c3_s4('r','e','e','f') # define c3__resd c3_s4('r','e','s','d') @@ -1244,11 +1249,13 @@ # define c3__ubin c3_s4('u','b','i','n') # define c3__ubit c3_s4('u','b','i','t') # define c3__ud c3_s2('u','d') +# define c3__uint c3_s4('u','i','n','t') # define c3__ulib c3_s4('u','l','i','b') # define c3__un c3_s2('u','n') # define c3__uniq c3_s4('u','n','i','q') # define c3__unix c3_s4('u','n','i','x') # define c3__unt c3_s3('u','n','t') +# define c3__unum c3_s3('u','n','u','m') # define c3__up c3_s2('u','p') # define c3__url c3_s3('u','r','l') # define c3__urth c3_s4('u','r','t','h') diff --git a/pkg/noun/allocate.c b/pkg/noun/allocate.c index eaa1527e61..3e00f21f19 100644 --- a/pkg/noun/allocate.c +++ b/pkg/noun/allocate.c @@ -2006,9 +2006,8 @@ u3a_maid(FILE* fil_u, c3_c* cap_c, c3_w wor_w) /* _ca_print_memory(): un-captioned u3a_print_memory(). 
*/ static void -_ca_print_memory(FILE* fil_u, c3_w wor_w) +_ca_print_memory(FILE* fil_u, c3_w byt_w) { - c3_w byt_w = (wor_w * 4); c3_w gib_w = (byt_w / 1000000000); c3_w mib_w = (byt_w % 1000000000) / 1000000; c3_w kib_w = (byt_w % 1000000) / 1000; @@ -2029,43 +2028,55 @@ _ca_print_memory(FILE* fil_u, c3_w wor_w) } } +/* u3a_quac_free: free quac memory. +*/ +void +u3a_quac_free(u3m_quac* qua_u) +{ + c3_w i_w = 0; + while ( qua_u->qua_u[i_w] != NULL ) { + u3a_quac_free(qua_u->qua_u[i_w]); + i_w++; + } + c3_free(qua_u->nam_c); + c3_free(qua_u->qua_u); + c3_free(qua_u); +} + /* u3a_prof(): mark/measure/print memory profile. RETAIN. */ -c3_w -u3a_prof(FILE* fil_u, c3_w den_w, u3_noun mas) +u3m_quac* +u3a_prof(FILE* fil_u, u3_noun mas) { - c3_w tot_w = 0; + u3m_quac* pro_u = c3_calloc(sizeof(*pro_u)); u3_noun h_mas, t_mas; if ( c3n == u3r_cell(mas, &h_mas, &t_mas) ) { - fprintf(fil_u, "%.*smistyped mass\r\n", den_w, ""); - return tot_w; + fprintf(fil_u, "mistyped mass\r\n"); + c3_free(pro_u); + return NULL; } - else if ( _(u3du(h_mas)) ) { - fprintf(fil_u, "%.*smistyped mass head\r\n", den_w, ""); + else if ( c3y == u3du(h_mas) ) { + fprintf(fil_u, "mistyped mass head\r\n"); { c3_c* lab_c = u3m_pretty(h_mas); fprintf(fil_u, "h_mas: %s", lab_c); c3_free(lab_c); } - return tot_w; + c3_free(pro_u); + return NULL; } else { - { - c3_c* lab_c = u3m_pretty(h_mas); - fprintf(fil_u, "%*s%s: ", den_w, "", lab_c); - c3_free(lab_c); - } u3_noun it_mas, tt_mas; if ( c3n == u3r_cell(t_mas, &it_mas, &tt_mas) ) { - fprintf(fil_u, "%*smistyped mass tail\r\n", den_w, ""); - return tot_w; + fprintf(fil_u, "mistyped mass tail\r\n"); + c3_free(pro_u); + return NULL; } else if ( c3y == it_mas ) { - tot_w += u3a_mark_noun(tt_mas); - _ca_print_memory(fil_u, tot_w); + c3_w siz_w = u3a_mark_noun(tt_mas); #if 1 /* The basic issue here is that tt_mas is included in .sac @@ -2076,7 +2087,7 @@ u3a_prof(FILE* fil_u, c3_w den_w, u3_noun mas) * * see u3a_mark_ptr(). 
*/ - if ( _(u3a_is_dog(tt_mas)) ) { + if ( c3y == u3a_is_dog(tt_mas) ) { u3a_box* box_u = u3a_botox(u3a_to_ptr(tt_mas)); #ifdef U3_MEMORY_DEBUG if ( 1 == box_u->eus_w ) { @@ -2095,45 +2106,131 @@ u3a_prof(FILE* fil_u, c3_w den_w, u3_noun mas) #endif } #endif + pro_u->nam_c = u3r_string(h_mas); + pro_u->siz_w = siz_w*4; + pro_u->qua_u = NULL; + return pro_u; - return tot_w; } else if ( c3n == it_mas ) { - fprintf(fil_u, "\r\n"); - - while ( _(u3du(tt_mas)) ) { - tot_w += u3a_prof(fil_u, den_w+2, u3h(tt_mas)); + pro_u->qua_u = c3_malloc(sizeof(pro_u->qua_u)); + c3_w i_w = 0; + c3_t bad_t = 0; + while ( c3y == u3du(tt_mas) ) { + u3m_quac* new_u = u3a_prof(fil_u, u3h(tt_mas)); + if ( NULL == new_u ) { + bad_t = 1; + } else { + pro_u->qua_u = c3_realloc(pro_u->qua_u, (i_w + 2) * sizeof(pro_u->qua_u)); + pro_u->siz_w += new_u->siz_w; + pro_u->qua_u[i_w] = new_u; + } tt_mas = u3t(tt_mas); + i_w++; } + pro_u->qua_u[i_w] = NULL; - fprintf(fil_u, "%*s--", den_w, ""); - _ca_print_memory(fil_u, tot_w); - - return tot_w; - + if ( bad_t ) { + i_w = 0; + while ( pro_u->qua_u[i_w] != NULL ) { + u3a_quac_free(pro_u->qua_u[i_w]); + i_w++; + } + c3_free(pro_u->qua_u); + c3_free(pro_u); + return NULL; + } else { + pro_u->nam_c = u3r_string(h_mas); + return pro_u; + } } else { - fprintf(fil_u, "%*smistyped (strange) mass tail\r\n", den_w, ""); - return tot_w; + fprintf(fil_u, "mistyped (strange) mass tail\r\n"); + c3_free(pro_u); + return NULL; + } + } +} + + +/* u3a_print_quac: print a memory report. 
+*/ + +void +u3a_print_quac(FILE* fil_u, c3_w den_w, u3m_quac* mas_u) +{ + u3_assert( 0 != fil_u ); + + if ( mas_u->siz_w ) { + fprintf(fil_u, "%*s%s: ", den_w, "", mas_u->nam_c); + + if ( mas_u->qua_u == NULL ) { + _ca_print_memory(fil_u, mas_u->siz_w); + } else { + fprintf(fil_u, "\r\n"); + c3_w i_w = 0; + while ( mas_u->qua_u[i_w] != NULL ) { + u3a_print_quac(fil_u, den_w+2, mas_u->qua_u[i_w]); + i_w++; + } + fprintf(fil_u, "%*s--", den_w, ""); + _ca_print_memory(fil_u, mas_u->siz_w); } } } /* u3a_mark_road(): mark ad-hoc persistent road structures. */ -c3_w -u3a_mark_road(FILE* fil_u) -{ - c3_w tot_w = 0; - tot_w += u3a_maid(fil_u, " namespace", u3a_mark_noun(u3R->ski.gul)); - tot_w += u3a_maid(fil_u, " trace stack", u3a_mark_noun(u3R->bug.tax)); - tot_w += u3a_maid(fil_u, " trace buffer", u3a_mark_noun(u3R->bug.mer)); - tot_w += u3a_maid(fil_u, " profile batteries", u3a_mark_noun(u3R->pro.don)); - tot_w += u3a_maid(fil_u, " profile doss", u3a_mark_noun(u3R->pro.day)); - tot_w += u3a_maid(fil_u, " new profile trace", u3a_mark_noun(u3R->pro.trace)); - tot_w += u3a_maid(fil_u, " transient memoization cache", u3h_mark(u3R->cax.har_p)); - tot_w += u3a_maid(fil_u, " persistent memoization cache", u3h_mark(u3R->cax.per_p)); - return u3a_maid(fil_u, "total road stuff", tot_w); +u3m_quac* +u3a_mark_road() +{ + u3m_quac** qua_u = c3_malloc(sizeof(*qua_u) * 9); + + qua_u[0] = c3_calloc(sizeof(*qua_u[0])); + qua_u[0]->nam_c = strdup("namespace"); + qua_u[0]->siz_w = u3a_mark_noun(u3R->ski.gul) * 4; + + qua_u[1] = c3_calloc(sizeof(*qua_u[1])); + qua_u[1]->nam_c = strdup("trace stack"); + qua_u[1]->siz_w = u3a_mark_noun(u3R->bug.tax) * 4; + + qua_u[2] = c3_calloc(sizeof(*qua_u[2])); + qua_u[2]->nam_c = strdup("trace buffer"); + qua_u[2]->siz_w = u3a_mark_noun(u3R->bug.mer) * 4; + + qua_u[3] = c3_calloc(sizeof(*qua_u[3])); + qua_u[3]->nam_c = strdup("profile batteries"); + qua_u[3]->siz_w = u3a_mark_noun(u3R->pro.don) * 4; + + qua_u[4] = c3_calloc(sizeof(*qua_u[4])); + 
qua_u[4]->nam_c = strdup("profile doss"); + qua_u[4]->siz_w = u3a_mark_noun(u3R->pro.day) * 4; + + qua_u[5] = c3_calloc(sizeof(*qua_u[5])); + qua_u[5]->nam_c = strdup("new profile trace"); + qua_u[5]->siz_w = u3a_mark_noun(u3R->pro.trace) * 4; + + qua_u[6] = c3_calloc(sizeof(*qua_u[6])); + qua_u[6]->nam_c = strdup("transient memoization cache"); + qua_u[6]->siz_w = u3h_mark(u3R->cax.har_p) * 4; + + qua_u[7] = c3_calloc(sizeof(*qua_u[7])); + qua_u[7]->nam_c = strdup("persistent memoization cache"); + qua_u[7]->siz_w = u3h_mark(u3R->cax.per_p) * 4; + + qua_u[8] = NULL; + + c3_w sum_w = 0; + for (c3_w i_w = 0; i_w < 8; i_w++) { + sum_w += qua_u[i_w]->siz_w; + } + + u3m_quac* tot_u = c3_malloc(sizeof(*tot_u)); + tot_u->nam_c = strdup("total road stuff"); + tot_u->siz_w = sum_w; + tot_u->qua_u = qua_u; + + return tot_u; } /* u3a_reclaim(): clear ad-hoc persistent caches to reclaim memory. diff --git a/pkg/noun/allocate.h b/pkg/noun/allocate.h index 0fa78cdaf2..3f71f42343 100644 --- a/pkg/noun/allocate.h +++ b/pkg/noun/allocate.h @@ -606,8 +606,8 @@ /* u3a_mark_road(): mark ad-hoc persistent road structures. */ - c3_w - u3a_mark_road(FILE* fil_u); + u3m_quac* + u3a_mark_road(); /* u3a_reclaim(): clear ad-hoc persistent caches to reclaim memory. */ @@ -702,21 +702,35 @@ void u3a_print_time(c3_c* str_c, c3_c* cap_c, c3_d mic_d); + /* u3a_print_quac: print a quac memory report. + */ + void + u3a_print_quac(FILE* fil_u, c3_w den_w, u3m_quac* mas_u); + /* u3a_print_memory(): print memory amount. */ void u3a_print_memory(FILE* fil_u, c3_c* cap_c, c3_w wor_w); - /* u3a_prof(): mark/measure/print memory profile. RETAIN. */ - c3_w - u3a_prof(FILE* fil_u, c3_w den_w, u3_noun mas); + u3m_quac* + u3a_prof(FILE* fil_u, u3_noun mas); /* u3a_maid(): maybe print memory. */ c3_w u3a_maid(FILE* fil_u, c3_c* cap_c, c3_w wor_w); + /* u3a_quac_free(): free quac memory. + */ + void + u3a_quac_free(u3m_quac* qua_u); + + /* u3a_uncap_print_memory(): un-captioned print memory amount. 
+ */ + void + u3a_uncap_print_memory(FILE* fil_u, c3_w byt_w); + /* u3a_deadbeef(): write 0xdeadbeef from hat to cap. */ void diff --git a/pkg/noun/jets.c b/pkg/noun/jets.c index 52e72b340d..200f7d46ef 100644 --- a/pkg/noun/jets.c +++ b/pkg/noun/jets.c @@ -2305,27 +2305,61 @@ _cj_mark_hank(u3_noun kev, void* dat) /* u3j_mark(): mark jet state for gc. */ -c3_w -u3j_mark(FILE* fil_u) +u3m_quac* +u3j_mark() { - c3_w tot_w = 0; + u3m_quac** qua_u = c3_malloc(sizeof(*qua_u) * 7); - tot_w += u3a_maid(fil_u, " warm jet state", u3h_mark(u3R->jed.war_p)); - tot_w += u3a_maid(fil_u, " cold jet state", u3h_mark(u3R->jed.cod_p)); - tot_w += u3a_maid(fil_u, " hank cache", u3h_mark(u3R->jed.han_p)); - tot_w += u3a_maid(fil_u, " battery hash cache", u3h_mark(u3R->jed.bas_p)); + qua_u[0] = c3_calloc(sizeof(*qua_u[0])); + qua_u[0]->nam_c = strdup("warm jet state"); + qua_u[0]->siz_w = u3h_mark(u3R->jed.war_p) * 4; - { - c3_w han_w = 0; - u3h_walk_with(u3R->jed.han_p, _cj_mark_hank, &han_w); - tot_w += u3a_maid(fil_u, " call site cache", han_w); + qua_u[1] = c3_calloc(sizeof(*qua_u[1])); + qua_u[1]->nam_c = strdup("cold jet state"); + qua_u[1]->siz_w = u3h_mark(u3R->jed.cod_p) * 4; + + qua_u[2] = c3_calloc(sizeof(*qua_u[2])); + qua_u[2]->nam_c = strdup("hank cache"); + qua_u[2]->siz_w = u3h_mark(u3R->jed.han_p) * 4; + + qua_u[3] = c3_calloc(sizeof(*qua_u[3])); + qua_u[3]->nam_c = strdup("battery hash cache"); + qua_u[3]->siz_w = u3h_mark(u3R->jed.bas_p) * 4; + + qua_u[4] = c3_calloc(sizeof(*qua_u[4])); + qua_u[4]->nam_c = strdup("call site cache"); + u3h_walk_with(u3R->jed.han_p, _cj_mark_hank, &qua_u[4]->siz_w); + qua_u[4]->siz_w *= 4; + + c3_w sum_w = 0; + for ( c3_w i_w = 0; i_w < 5; i_w++ ) { + sum_w += qua_u[i_w]->siz_w; } + u3m_quac* tot_u = c3_calloc(sizeof(*tot_u)); + tot_u->nam_c = strdup("total jet stuff"); + if ( u3R == &(u3H->rod_u) ) { - tot_w += u3a_maid(fil_u, " hot jet state", u3h_mark(u3R->jed.hot_p)); - } + qua_u[5] = c3_calloc(sizeof(*qua_u[5])); + qua_u[5]->nam_c 
= strdup("hot jet state"); + qua_u[5]->siz_w = u3h_mark(u3R->jed.hot_p) * 4; + + sum_w += qua_u[5]->siz_w; + + qua_u[6] = NULL; + + tot_u->siz_w = sum_w; + tot_u->qua_u = qua_u; - return u3a_maid(fil_u, "total jet stuff", tot_w); + return tot_u; + } else { + qua_u[5] = NULL; + + tot_u->siz_w = sum_w; + tot_u->qua_u = qua_u; + + return tot_u; + } } /* u3j_free_hank(): free an entry from the hank cache. diff --git a/pkg/noun/jets.h b/pkg/noun/jets.h index bfaa68aeaa..c81ab18318 100644 --- a/pkg/noun/jets.h +++ b/pkg/noun/jets.h @@ -296,8 +296,8 @@ /* u3j_mark(): mark jet state for gc. */ - c3_w - u3j_mark(FILE* fil_u); + u3m_quac* + u3j_mark(); /* u3j_free(): free jet state. */ diff --git a/pkg/noun/jets/i/lagoon.c b/pkg/noun/jets/i/lagoon.c new file mode 100644 index 0000000000..db7088b96e --- /dev/null +++ b/pkg/noun/jets/i/lagoon.c @@ -0,0 +1,3315 @@ +/// @file + +#include "jets/q.h" +#include "jets/w.h" + +#include "c3/motes.h" + +#include "noun.h" +#include "softfloat.h" +#include "softblas.h" + +#include // for pow() +#include + +#define f16_ceil(a) f16_roundToInt( a, softfloat_round_max, false ) +#define f32_ceil(a) f32_roundToInt( a, softfloat_round_max, false ) +#define f64_ceil(a) f64_roundToInt( a, softfloat_round_max, false ) +#define f128M_ceil(a, b) f128M_roundToInt( a, softfloat_round_max, false, b ) + + union half { + float16_t h; + c3_w c; + }; + + union sing { + float32_t s; + c3_w c; + }; + + union doub { + float64_t d; + c3_d c; + }; + + union quad { + float128_t q; + c3_d c[2]; + }; + + // $?(%n %u %d %z %a) + static inline void + _set_rounding(c3_w a) + { + // We could use SoftBLAS set_rounding() to set the SoftFloat + // mode as well, but it's more explicit to do it here since + // we may use SoftFloat in any given Lagoon jet and we want + // you, dear developer, to see it set here. 
+ switch ( a ) + { + default: + u3m_bail(c3__fail); + break; + // %n - near + case c3__n: + softfloat_roundingMode = softfloat_round_near_even; + softblas_roundingMode = 'n'; + break; + // %z - zero + case c3__z: + softfloat_roundingMode = softfloat_round_minMag; + softblas_roundingMode = 'z'; + break; + // %u - up + case c3__u: + softfloat_roundingMode = softfloat_round_max; + softblas_roundingMode = 'u'; + break; + // %d - down + case c3__d: + softfloat_roundingMode = softfloat_round_min; + softblas_roundingMode = 'd'; + break; + // %a - away + case c3__a: + softfloat_roundingMode = softfloat_round_near_maxMag; + softblas_roundingMode = 'a'; + break; + } + } + +/* length of shape = x * y * z * w * ... +*/ + static inline c3_d _get_length(u3_noun shape) + { + c3_d len = 1; + while (u3_nul != shape) { + len = len * u3x_atom(u3h(shape)); + shape = u3t(shape); + } + return len; + } + +/* get dims from shape as array [x y z w ...] +*/ + static inline c3_d* _get_dims(u3_noun shape) + { + u3_atom len = u3qb_lent(shape); + c3_d len_d = u3r_chub(0, len); + c3_d* dims = (c3_d*)u3a_malloc(len_d*sizeof(c3_d)); + for (c3_d i = 0; i < len_d; i++) { + dims[i] = u3r_chub(0, u3x_atom(u3h(shape))); + shape = u3t(shape); + } + u3z(len); + return dims; + } + +/* check consistency of array shape and bloq size + |= =ray + ^- ? + .= (roll shape.meta.ray ^mul) + (dec (met bloq.meta.ray data.ray)) +*/ + static inline c3_o _check(u3_noun ray) + { + // Calculate expected size. + u3_atom shp = u3h(u3h(ray)); // (reported) shape of ray, +4 + u3_atom blq = u3h(u3t(u3h(ray))); // block size of ray, +10 + u3_atom sin = _get_length(shp); // calculated length of ray + + // Calculate actual size. 
+ u3_atom len = u3r_met(blq, u3t(ray)); // length of ray + u3_atom dex = u3qa_dec(len); // decrement length b/c of pinned 1 + + return __(sin == dex); + } + +/* add - axpy = 1*x+y +*/ + u3_noun + u3qi_la_add_i754(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq + ) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + haxpy(len_x, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); + break; + + case 5: + saxpy(len_x, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + break; + + case 6: + daxpy(len_x, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + break; + + case 7: + qaxpy(len_x, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* sub - axpy = -1*y+x +*/ + u3_noun + u3qi_la_sub_i754(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq + ) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. 
+ // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + haxpy(len_x, (float16_t){SB_REAL16_NEGONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); + break; + + case 5: + saxpy(len_x, (float32_t){SB_REAL32_NEGONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + break; + + case 6: + daxpy(len_x, (float64_t){SB_REAL64_NEGONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + break; + + case 7: + qaxpy(len_x, (float128_t){SB_REAL128L_NEGONE,SB_REAL128U_NEGONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + + +/* mul - x.*y + elementwise multiplication +*/ + u3_noun + u3qi_la_mul_i754(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. 
+ // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + ((float16_t*)y_bytes)[i] = f16_mul(((float16_t*)x_bytes)[i], ((float16_t*)y_bytes)[i]); + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + ((float32_t*)y_bytes)[i] = f32_mul(((float32_t*)x_bytes)[i], ((float32_t*)y_bytes)[i]); + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + ((float64_t*)y_bytes)[i] = f64_mul(((float64_t*)x_bytes)[i], ((float64_t*)y_bytes)[i]); + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + f128M_mul(&(((float128_t*)y_bytes)[i]), &(((float128_t*)x_bytes)[i]), &(((float128_t*)y_bytes)[i])); + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* div - x/y + elementwise division +*/ + u3_noun + u3qi_la_div_i754(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. 
+ // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + ((float16_t*)y_bytes)[i] = f16_div(((float16_t*)x_bytes)[i], ((float16_t*)y_bytes)[i]); + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + ((float32_t*)y_bytes)[i] = f32_div(((float32_t*)x_bytes)[i], ((float32_t*)y_bytes)[i]); + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + ((float64_t*)y_bytes)[i] = f64_div(((float64_t*)x_bytes)[i], ((float64_t*)y_bytes)[i]); + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + f128M_div(&(((float128_t*)y_bytes)[i]), &(((float128_t*)x_bytes)[i]), &(((float128_t*)y_bytes)[i])); + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* mod - x % y = x - r*floor(x/r) + remainder after division +*/ + u3_noun + u3qi_la_mod_i754(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. 
+ // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + float16_t y_val16 = ((float16_t*)y_bytes)[i]; + // Perform division x/n + float16_t div_result16 = f16_div(x_val16, y_val16); + // Compute floor of the division result + c3_ds floor_result16 = f16_to_i64(div_result16, softfloat_round_minMag, false); + float16_t floor_float16 = i64_to_f16(floor_result16); + // Multiply n by floor(x/n) + float16_t mult_result16 = f16_mul(y_val16, floor_float16); + // Compute remainder: x - n * floor(x/n) + ((float16_t*)y_bytes)[i] = f16_sub(x_val16, mult_result16); + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + float32_t y_val32 = ((float32_t*)y_bytes)[i]; + // Perform division x/n + float32_t div_result32 = f32_div(x_val32, y_val32); + // Compute floor of the division result + c3_ds floor_result32 = f32_to_i64(div_result32, softfloat_round_minMag, false); + float32_t floor_float32 = i64_to_f32(floor_result32); + // Multiply n by floor(x/n) + float32_t mult_result32 = f32_mul(y_val32, floor_float32); + // Compute remainder: x - n * floor(x/n) + ((float32_t*)y_bytes)[i] = f32_sub(x_val32, mult_result32); + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + float64_t y_val64 = ((float64_t*)y_bytes)[i]; + // Perform division x/n + float64_t div_result64 = f64_div(x_val64, y_val64); + // Compute floor of the 
division result + c3_ds floor_result64 = f64_to_i64(div_result64, softfloat_round_minMag, false); + float64_t floor_float64 = i64_to_f64(floor_result64); + // Multiply n by floor(x/n) + float64_t mult_result64 = f64_mul(y_val64, floor_float64); + // Compute remainder: x - n * floor(x/n) + ((float64_t*)y_bytes)[i] = f64_sub(x_val64, mult_result64); + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + float128_t x_val128 = ((float128_t*)x_bytes)[i]; + float128_t y_val128 = ((float128_t*)y_bytes)[i]; + // Perform division x/n + float128_t div_result128; + f128M_div((float128_t*)&x_val128, (float128_t*)&y_val128, (float128_t*)&div_result128); + // Compute floor of the division result + c3_ds floor_result128 = f128M_to_i64(&div_result128, softfloat_round_minMag, false); + float128_t floor_float128; + i64_to_f128M(floor_result128, &floor_float128); + // Multiply n by floor(x/n) + float128_t mult_result128; + f128M_mul(((float128_t*)&y_val128), ((float128_t*)&floor_float128), ((float128_t*)&mult_result128)); + // Compute remainder: x - n * floor(x/n) + f128M_sub(((float128_t*)&x_val128), ((float128_t*)&mult_result128), &(((float128_t*)y_bytes)[i])); + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* cumsum - x[0] + x[1] + ... x[n] +*/ + u3_noun + u3qi_la_cumsum_i754(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. 
+ // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // y_bytes is the data array (w/ leading 0x1, skipped by for range) + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); + + u3_noun r_data; + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: { + float16_t sum16[2]; + sum16[0] = (float16_t){SB_REAL16_ZERO}; + for (c3_d i = len_x; i > 0; i--) { + sum16[0] = f16_add(sum16[0], ((float16_t*)x_bytes)[i-1]); + } + sum16[1].v = 0x1; + r_data = u3i_bytes((2+1)*sizeof(c3_y), (c3_y*)sum16); + break;} + + case 5: { + float32_t sum32[2]; + sum32[0] = (float32_t){SB_REAL32_ZERO}; + for (c3_d i = len_x; i > 0; i--) { + sum32[0] = f32_add(sum32[0], ((float32_t*)x_bytes)[i-1]); + } + sum32[1].v = 0x1; + r_data = u3i_bytes((4+1)*sizeof(c3_y), (c3_y*)sum32); + break;} + + case 6: { + float64_t sum64[2]; + sum64[0] = (float64_t){SB_REAL64_ZERO}; + for (c3_d i = len_x; i > 0; i--) { + sum64[0] = f64_add(sum64[0], ((float64_t*)x_bytes)[i-1]); + } + sum64[1].v = 0x1; + r_data = u3i_bytes((8+1)*sizeof(c3_y), (c3_y*)sum64); + break;} + + case 7: { + float128_t sum128[2]; + sum128[0] = (float128_t){SB_REAL128L_ZERO, SB_REAL128U_ZERO}; + for (c3_d i = len_x; i > 0; i--) { + f128M_add(&(sum128[0]), &(((float128_t*)x_bytes)[i-1]), &(sum128[0])); + } + sum128[1] = (float128_t){0x1, 0x0}; + r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)sum128); + break;} + } + + // Clean up and return. + u3a_free(x_bytes); + + return r_data; + } + +/* argmin - argmin(x) +*/ + u3_noun + u3qi_la_argmin_i754(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. 
+ // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1, which doesn't matter here) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + c3_w min_idx = 0; + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: { + float16_t min_val16 = ((float16_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f16_lt(((float16_t*)x_bytes)[i], min_val16)) { + min_val16 = ((float16_t*)x_bytes)[i]; + min_idx = (len_x - i - 1); + } + } + break;} + + case 5: { + float32_t min_val32 = ((float32_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f32_lt(((float32_t*)x_bytes)[i], min_val32)) { + min_val32 = ((float32_t*)x_bytes)[i]; + min_idx = (len_x - i - 1); + } + } + break;} + + case 6: { + float64_t min_val64 = ((float64_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f64_lt(((float64_t*)x_bytes)[i], min_val64)) { + min_val64 = ((float64_t*)x_bytes)[i]; + min_idx = (len_x - i - 1); + } + } + break;} + + case 7: { + float128_t min_val128 = ((float128_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f128M_lt(&(((float128_t*)x_bytes)[i]), &min_val128)) { + min_val128 = *f128M_min(&min_val128, &((float128_t*)x_bytes)[i]); + min_idx = (len_x - i - 1); + } + } + break;} + } + + u3_noun r_data = u3i_chub(min_idx); + + return r_data; + } + +/* argmax - argmax(x) +*/ + u3_noun + u3qi_la_argmax_i754(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. 
+ // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1, which doesn't matter here) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + c3_w max_idx = 0; + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: { + float16_t max_val16 = ((float16_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f16_gt(((float16_t*)x_bytes)[i], max_val16)) { + max_val16 = ((float16_t*)x_bytes)[i]; + max_idx = (len_x - i - 1); + } + } + break;} + + case 5: { + float32_t max_val32 = ((float32_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f32_gt(((float32_t*)x_bytes)[i], max_val32)) { + max_val32 = ((float32_t*)x_bytes)[i]; + max_idx = (len_x - i - 1); + } + } + break;} + + case 6: { + float64_t max_val64 = ((float64_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f64_gt(((float64_t*)x_bytes)[i], max_val64)) { + max_val64 = ((float64_t*)x_bytes)[i]; + max_idx = (len_x - i - 1); + } + } + break;} + + case 7: { + float128_t max_val128 = ((float128_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f128M_gt(&(((float128_t*)x_bytes)[i]), &max_val128)) { + max_val128 = *f128M_max(&max_val128, &((float128_t*)x_bytes)[i]); + max_idx = (len_x - i - 1); + } + } + break;} + } + + u3_noun r_data = u3i_chub(max_idx); + + return r_data; + } + +/* ravel - x -> ~[x[0], x[1], ... x[n]] + entire nd-array busted out as a linear list +*/ + u3_noun + u3qi_la_ravel_i754(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. 
+ // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // r_data is the result noun of [data] + u3_noun r_data = u3_nul; + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + r_data = u3nc(u3i_word(x_val16.v), r_data); + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + r_data = u3nc(u3i_word(x_val32.v), r_data); + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + r_data = u3nc(u3i_chub(x_val64.v), r_data); + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + float128_t x_val128 = ((float128_t*)x_bytes)[i]; + r_data = u3nc(u3i_chubs(2, (c3_d*)&(x_val128.v)), r_data); + } + break; + } + + // Clean up and return. + u3a_free(x_bytes); + + return r_data; + } + +/* min - min(x,y) +*/ + u3_noun + u3qi_la_min_i754(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/ leading 0x1, skipped by for range) + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); + + u3_noun r_data; + + // Switch on the block size. 
+ switch (u3x_atom(bloq)) { + case 4: { + float16_t min_val16 = ((float16_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + min_val16 = f16_min(min_val16, ((float16_t*)x_bytes)[i]); + } + float16_t r16[2]; + r16[0] = min_val16; + r16[1].v = 0x1; + r_data = u3i_bytes((2+1)*sizeof(c3_y), (c3_y*)r16); + break;} + + case 5: { + float32_t min_val32 = ((float32_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + min_val32 = f32_min(min_val32, ((float32_t*)x_bytes)[i]); + } + float32_t r32[2]; + r32[0] = min_val32; + r32[1].v = 0x1; + r_data = u3i_bytes((4+1)*sizeof(c3_y), (c3_y*)r32); + break;} + + case 6: { + float64_t min_val64 = ((float64_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + min_val64 = f64_min(min_val64, ((float64_t*)x_bytes)[i]); + } + float64_t r64[2]; + r64[0] = min_val64; + r64[1].v = 0x1; + r_data = u3i_bytes((8+1)*sizeof(c3_y), (c3_y*)r64); + break;} + + case 7: { + float128_t min_val128 = ((float128_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + min_val128 = *f128M_min(&min_val128, &((float128_t*)x_bytes)[i]); + } + float128_t r128[2]; + r128[0] = min_val128; + r128[1] = (float128_t){0x1, 0x0}; + r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)r128); + break;} + } + + // Clean up and return. + u3a_free(x_bytes); + + return r_data; + } + +/* max - max(x,y) +*/ + u3_noun + u3qi_la_max_i754(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/ leading 0x1, skipped by for range) + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); + + u3_noun r_data; + + // Switch on the block size. 
+ switch (u3x_atom(bloq)) { + case 4: { + float16_t max_val16 = ((float16_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + max_val16 = f16_max(max_val16, ((float16_t*)x_bytes)[i]); + } + float16_t r16[2]; + r16[0] = max_val16; + r16[1].v = 0x1; + r_data = u3i_bytes((2+1)*sizeof(c3_y), (c3_y*)r16); + break;} + + case 5: { + float32_t max_val32 = ((float32_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + max_val32 = f32_max(max_val32, ((float32_t*)x_bytes)[i]); + } + float32_t r32[2]; + r32[0] = max_val32; + r32[1].v = 0x1; + r_data = u3i_bytes((4+1)*sizeof(c3_y), (c3_y*)r32); + break;} + + case 6: { + float64_t max_val64 = ((float64_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + max_val64 = f64_max(max_val64, ((float64_t*)x_bytes)[i]); + } + float64_t r64[2]; + r64[0] = max_val64; + r64[1].v = 0x1; + r_data = u3i_bytes((8+1)*sizeof(c3_y), (c3_y*)r64); + break;} + + case 7: { + float128_t max_val128 = ((float128_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + max_val128 = *f128M_max(&max_val128, &((float128_t*)x_bytes)[i]); + } + float128_t r128[2]; + r128[0] = max_val128; + r128[1] = (float128_t){0x1, 0x0}; + r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)r128); + break;} + } + + // Clean up and return. + u3a_free(x_bytes); + + return r_data; + } + +/* abs - |x| +*/ + u3_noun + u3qi_la_abs_i754(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/ leading 0x1, skipped by for range) + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); + + // Switch on the block size. 
+ switch (u3x_atom(bloq)) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + ((float16_t*)x_bytes)[i] = f16_abs(((float16_t*)x_bytes)[i]); + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + ((float32_t*)x_bytes)[i] = f32_abs(((float32_t*)x_bytes)[i]); + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + ((float64_t*)x_bytes)[i] = f64_abs(((float64_t*)x_bytes)[i]); + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + ((float128_t*)x_bytes)[i] = f128_abs(((float128_t*)x_bytes)[i]); + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), x_bytes); + + // Clean up and return. + u3a_free(x_bytes); + + return r_data; + } + +/* gth - x > y +*/ + u3_noun + u3qi_la_gth_i754(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + float16_t y_val16 = ((float16_t*)y_bytes)[i]; + ((float16_t*)y_bytes)[i] = f16_gt(x_val16, y_val16) ? (float16_t){SB_REAL16_ONE} : (float16_t){SB_REAL16_ZERO}; + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + float32_t y_val32 = ((float32_t*)y_bytes)[i]; + ((float32_t*)y_bytes)[i] = f32_gt(x_val32, y_val32) ? 
(float32_t){SB_REAL32_ONE} : (float32_t){SB_REAL32_ZERO}; + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + float64_t y_val64 = ((float64_t*)y_bytes)[i]; + ((float64_t*)y_bytes)[i] = f64_gt(x_val64, y_val64) ? (float64_t){SB_REAL64_ONE} : (float64_t){SB_REAL64_ZERO}; + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + float128_t x_val128 = ((float128_t*)x_bytes)[i]; + float128_t y_val128 = ((float128_t*)y_bytes)[i]; + ((float128_t*)y_bytes)[i] = f128M_gt(((float128_t*)&x_val128), ((float128_t*)&y_val128)) ? (float128_t){SB_REAL128L_ONE, SB_REAL128U_ONE} : (float128_t){SB_REAL128L_ZERO, SB_REAL128U_ZERO}; + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* gte - x > y +*/ + u3_noun + u3qi_la_gte_i754(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + float16_t y_val16 = ((float16_t*)y_bytes)[i]; + ((float16_t*)y_bytes)[i] = f16_ge(x_val16, y_val16) ? 
(float16_t){SB_REAL16_ONE} : (float16_t){SB_REAL16_ZERO}; + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + float32_t y_val32 = ((float32_t*)y_bytes)[i]; + ((float32_t*)y_bytes)[i] = f32_ge(x_val32, y_val32) ? (float32_t){SB_REAL32_ONE} : (float32_t){SB_REAL32_ZERO}; + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + float64_t y_val64 = ((float64_t*)y_bytes)[i]; + ((float64_t*)y_bytes)[i] = f64_ge(x_val64, y_val64) ? (float64_t){SB_REAL64_ONE} : (float64_t){SB_REAL64_ZERO}; + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + float128_t x_val128 = ((float128_t*)x_bytes)[i]; + float128_t y_val128 = ((float128_t*)y_bytes)[i]; + ((float128_t*)y_bytes)[i] = f128M_ge(((float128_t*)&x_val128), ((float128_t*)&y_val128)) ? (float128_t){SB_REAL128L_ONE, SB_REAL128U_ONE} : (float128_t){SB_REAL128L_ZERO, SB_REAL128U_ZERO}; + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* lth - x > y +*/ + u3_noun + u3qi_la_lth_i754(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. 
+ switch (u3x_atom(bloq)) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + float16_t y_val16 = ((float16_t*)y_bytes)[i]; + ((float16_t*)y_bytes)[i] = f16_lt(x_val16, y_val16) ? (float16_t){SB_REAL16_ONE} : (float16_t){SB_REAL16_ZERO}; + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + float32_t y_val32 = ((float32_t*)y_bytes)[i]; + ((float32_t*)y_bytes)[i] = f32_lt(x_val32, y_val32) ? (float32_t){SB_REAL32_ONE} : (float32_t){SB_REAL32_ZERO}; + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + float64_t y_val64 = ((float64_t*)y_bytes)[i]; + ((float64_t*)y_bytes)[i] = f64_lt(x_val64, y_val64) ? (float64_t){SB_REAL64_ONE} : (float64_t){SB_REAL64_ZERO}; + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + float128_t x_val128 = ((float128_t*)x_bytes)[i]; + float128_t y_val128 = ((float128_t*)y_bytes)[i]; + ((float128_t*)y_bytes)[i] = f128M_lt(((float128_t*)&x_val128), ((float128_t*)&y_val128)) ? (float128_t){SB_REAL128L_ONE, SB_REAL128U_ONE} : (float128_t){SB_REAL128L_ZERO, SB_REAL128U_ZERO}; + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* lte - x > y +*/ + u3_noun + u3qi_la_lte_i754(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. 
+ // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + float16_t y_val16 = ((float16_t*)y_bytes)[i]; + ((float16_t*)y_bytes)[i] = f16_le(x_val16, y_val16) ? (float16_t){SB_REAL16_ONE} : (float16_t){SB_REAL16_ZERO}; + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + float32_t y_val32 = ((float32_t*)y_bytes)[i]; + ((float32_t*)y_bytes)[i] = f32_le(x_val32, y_val32) ? (float32_t){SB_REAL32_ONE} : (float32_t){SB_REAL32_ZERO}; + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + float64_t y_val64 = ((float64_t*)y_bytes)[i]; + ((float64_t*)y_bytes)[i] = f64_le(x_val64, y_val64) ? (float64_t){SB_REAL64_ONE} : (float64_t){SB_REAL64_ZERO}; + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + float128_t x_val128 = ((float128_t*)x_bytes)[i]; + float128_t y_val128 = ((float128_t*)y_bytes)[i]; + ((float128_t*)y_bytes)[i] = f128M_le(((float128_t*)&x_val128), ((float128_t*)&y_val128)) ? (float128_t){SB_REAL128L_ONE, SB_REAL128U_ONE} : (float128_t){SB_REAL128L_ZERO, SB_REAL128U_ZERO}; + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. 
+ u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* adds - axpy = 1*x+[n] +*/ + u3_noun + u3qi_la_adds_i754(u3_noun x_data, + u3_noun n, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + + float16_t n16; + float32_t n32; + float64_t n64; + float128_t n128; + + // Switch on the block size. We assume that n fits in the target block size; Hoon typecheck should prevent. + switch (u3x_atom(bloq)) { + case 4: + u3r_bytes(0, 2, (c3_y*)&(n16.v), n); + // set y to [n] + for (c3_d i = 0; i < len_x; i++) { + ((float16_t*)y_bytes)[i] = n16; + } + haxpy(len_x, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); + break; + + case 5: + u3r_bytes(0, 4, (c3_y*)&(n32.v), n); + // set y to [n] + for (c3_d i = 0; i < len_x; i++) { + ((float32_t*)y_bytes)[i] = n32; + } + saxpy(len_x, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + break; + + case 6: + u3r_bytes(0, 8, (c3_y*)&(n64.v), n); + // set y to [n] + for (c3_d i = 0; i < len_x; i++) { + ((float64_t*)y_bytes)[i] = n64; + } + daxpy(len_x, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + break; + + case 7: + u3r_bytes(0, 16, (c3_y*)&(n128.v[0]), n); + // set y to [n] + for (c3_d i = 0; i < len_x; i++) { + ((float128_t*)y_bytes)[i] = (float128_t){n128.v[0], n128.v[1]}; + } + qaxpy(len_x, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, 1, 
(float128_t*)y_bytes, 1); + break; + } + + // r_data is the result noun of [data] + y_bytes[syz_x] = 0x1; // pin head + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* subs - axpy = -1*[n]+x +*/ + u3_noun + u3qi_la_subs_i754(u3_noun x_data, + u3_noun n, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/o leading 0x1) + c3_y* y_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + + float16_t n16; + float32_t n32; + float64_t n64; + float128_t n128; + + // Switch on the block size. We assume that n fits in the target block size; Hoon typecheck should prevent. 
+ switch (u3x_atom(bloq)) { + case 4: + u3r_bytes(0, 2, (c3_y*)&(n16.v), n); + // set y to [n] + for (c3_d i = 0; i < len_x; i++) { + ((float16_t*)y_bytes)[i] = n16; + } + haxpy(len_x, (float16_t){SB_REAL16_NEGONE}, (float16_t*)y_bytes, 1, (float16_t*)x_bytes, 1); + break; + + case 5: + u3r_bytes(0, 4, (c3_y*)&(n32.v), n); + // set y to [n] + for (c3_d i = 0; i < len_x; i++) { + ((float32_t*)y_bytes)[i] = n32; + } + saxpy(len_x, (float32_t){SB_REAL32_NEGONE}, (float32_t*)y_bytes, 1, (float32_t*)x_bytes, 1); + break; + + case 6: + u3r_bytes(0, 8, (c3_y*)&(n64.v), n); + // set y to [n] + for (c3_d i = 0; i < len_x; i++) { + ((float64_t*)y_bytes)[i] = n64; + } + daxpy(len_x, (float64_t){SB_REAL64_NEGONE}, (float64_t*)y_bytes, 1, (float64_t*)x_bytes, 1); + break; + + case 7: + u3r_bytes(0, 16, (c3_y*)&(n128.v[0]), n); + // set y to [n] + for (c3_d i = 0; i < len_x; i++) { + ((float128_t*)y_bytes)[i] = (float128_t){n128.v[0], n128.v[1]}; + } + qaxpy(len_x, (float128_t){SB_REAL128L_NEGONE,SB_REAL128U_NEGONE}, (float128_t*)y_bytes, 1, (float128_t*)x_bytes, 1); + break; + } + + // r_data is the result noun of [data] + x_bytes[syz_x] = 0x1; // pin head + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), x_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* muls - ?scal n * x + elementwise multiplication +*/ + u3_noun + u3qi_la_muls_i754(u3_noun x_data, + u3_noun n, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. 
+ // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + x_bytes[syz_x] = 0x1; // pin head + + float16_t n16; + float32_t n32; + float64_t n64; + float128_t n128; + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + u3r_bytes(0, 2, (c3_y*)&(n16.v), n); + hscal(len_x, n16, (float16_t*)x_bytes, 1); + break; + + case 5: + u3r_bytes(0, 4, (c3_y*)&(n32.v), n); + sscal(len_x, n32, (float32_t*)x_bytes, 1); + break; + + case 6: + u3r_bytes(0, 8, (c3_y*)&(n64.v), n); + dscal(len_x, n64, (float64_t*)x_bytes, 1); + break; + + case 7: + u3r_bytes(0, 16, (c3_y*)&(n128.v[0]), n); + qscal(len_x, n128, (float128_t*)x_bytes, 1); + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), x_bytes); + + // Clean up and return. + u3a_free(x_bytes); + + return r_data; + } + +/* divs - ?scal 1/n * x + elementwise division +*/ + u3_noun + u3qi_la_divs_i754(u3_noun x_data, + u3_noun n, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + x_bytes[syz_x] = 0x1; // pin head + + float16_t in16; + float32_t in32; + float64_t in64; + float128_t in128; + + // Switch on the block size. 
+ switch (u3x_atom(bloq)) { + case 4: + // XX note that in16 is doing double duty here + u3r_bytes(0, 2, (c3_y*)&(in16.v), n); + in16 = f16_div((float16_t){SB_REAL16_ONE}, in16); + hscal(len_x, in16, (float16_t*)x_bytes, 1); + break; + + case 5: + // XX note that in32 is doing double duty here + u3r_bytes(0, 4, (c3_y*)&(in32.v), n); + in32 = f32_div((float32_t){SB_REAL32_ONE}, in32); + sscal(len_x, in32, (float32_t*)x_bytes, 1); + break; + + case 6: + // XX note that in64 is doing double duty here + u3r_bytes(0, 8, (c3_y*)&(in64.v), n); + in64 = f64_div((float64_t){SB_REAL64_ONE}, in64); + dscal(len_x, in64, (float64_t*)x_bytes, 1); + break; + + case 7: + // XX note that in128 is doing double duty here + u3r_bytes(0, 16, (c3_y*)&(in128.v[0]), n); + f128M_div(&((float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}), &in128, &in128); + qscal(len_x, in128, (float128_t*)x_bytes, 1); + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), x_bytes); + + // Clean up and return. + u3a_free(x_bytes); + + return r_data; + } + +/* mods - x % [n] = x - r*floor(x/r) + remainder after scalar division +*/ + u3_noun + u3qi_la_mods_i754(u3_noun x_data, + u3_noun n, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + // we reuse it for results for parsimony + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); + + float16_t n16, in16; + float32_t n32, in32; + float64_t n64, in64; + float128_t n128, in128; + + // Switch on the block size. 
+ switch (u3x_atom(bloq)) { + case 4: + u3r_bytes(0, 2, (c3_y*)&(n16.v), n); + in16 = f16_div((float16_t){SB_REAL16_ONE}, n16); + + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + // Perform division x/n + float16_t div_result16 = f16_mul(in16, x_val16); + // Compute floor of the division result + c3_ds floor_result16 = f16_to_i64(div_result16, softfloat_round_minMag, false); + float16_t floor_float16 = i64_to_f16(floor_result16); + // Multiply n by floor(x/n) + float16_t mult_result16 = f16_mul(n16, floor_float16); + // Compute remainder: x - n * floor(x/n) + ((float16_t*)x_bytes)[i] = f16_sub(x_val16, mult_result16); + } + break; + + case 5: + u3r_bytes(0, 4, (c3_y*)&(n32.v), n); + in32 = f32_div((float32_t){SB_REAL32_ONE}, n32); + + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + // Perform division x/n + float32_t div_result32 = f32_mul(in32, x_val32); + // Compute floor of the division result + c3_ds floor_result32 = f32_to_i64(div_result32, softfloat_round_minMag, false); + float32_t floor_float32 = i64_to_f32(floor_result32); + // Multiply n by floor(x/n) + float32_t mult_result32 = f32_mul(n32, floor_float32); + // Compute remainder: x - n * floor(x/n) + ((float32_t*)x_bytes)[i] = f32_sub(x_val32, mult_result32); + } + break; + + case 6: + u3r_bytes(0, 8, (c3_y*)&(n64.v), n); + in64 = f64_div((float64_t){SB_REAL64_ONE}, n64); + + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + // Perform division x/n + float64_t div_result64 = f64_mul(in64, x_val64); + // Compute floor of the division result + c3_ds floor_result64 = f64_to_i64(div_result64, softfloat_round_minMag, false); + float64_t floor_float64 = i64_to_f64(floor_result64); + // Multiply n by floor(x/n) + float64_t mult_result64 = f64_mul(n64, floor_float64); + // Compute remainder: x - n * floor(x/n) + ((float64_t*)x_bytes)[i] = f64_sub(x_val64, mult_result64); + } + break; + + case 7: + 
u3r_bytes(0, 16, (c3_y*)&(n128.v[0]), n);
+ f128M_div(&((float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}), &n128, &in128);
+
+ for (c3_d i = 0; i < len_x; i++) {
+ float128_t x_val128 = ((float128_t*)x_bytes)[i];
+ // Perform division x/n
+ float128_t div_result128;
+ f128M_mul((float128_t*)&in128, (float128_t*)&x_val128, (float128_t*)&div_result128);
+ // Compute floor of the division result
+ // NB(review): softfloat_round_minMag truncates toward zero, which equals
+ // floor only for non-negative quotients — confirm intended semantics for
+ // negative x/n (softfloat_round_min would be true floor).
+ c3_ds floor_result128 = f128M_to_i64(&div_result128, softfloat_round_minMag, false);
+ float128_t floor_float128;
+ i64_to_f128M(floor_result128, &floor_float128);
+ // Multiply n by floor(x/n)
+ float128_t mult_result128;
+ f128M_mul(((float128_t*)&n128), ((float128_t*)&floor_float128), ((float128_t*)&mult_result128));
+ // Compute remainder: x - n * floor(x/n)
+ f128M_sub(((float128_t*)&x_val128), ((float128_t*)&mult_result128), &(((float128_t*)x_bytes)[i]));
+ }
+ break;
+ }
+
+ // r_data is the result noun of [data]
+ u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), x_bytes);
+
+ // Clean up and return.
+ u3a_free(x_bytes);
+
+ return r_data;
+ }
+
+/* dot - ?dot = x · y
+*/
+ u3_noun
+ u3qi_la_dot_i754(u3_noun x_data,
+ u3_noun y_data,
+ u3_noun shape,
+ u3_noun bloq)
+ {
+ // Fence on valid bloq size.
+ if (bloq < 4 || bloq > 7) {
+ return u3_none;
+ }
+
+ // Unpack the data as a byte array. We assume total length < 2**64.
+ // len_x is length in base units
+ c3_d len_x = _get_length(shape);
+
+ // syz_x is length in bytes
+ c3_d syz_x = len_x * pow(2, bloq-3);
+
+ // x_bytes is the data array (w/o leading 0x1)
+ c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y));
+ u3r_bytes(0, syz_x, x_bytes, x_data);
+
+ // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy)
+ c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y));
+ u3r_bytes(0, syz_x+1, y_bytes, y_data);
+
+ u3_noun r_data;
+
+ // Switch on the block size.
+ switch (u3x_atom(bloq)) { + case 4: { + float16_t r16[2]; + r16[0] = hdot(len_x, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); + r16[1].v = 0x1; + r_data = u3i_bytes((2+1)*sizeof(c3_y), (c3_y*)r16); + break;} + + case 5: { + float32_t r32[2]; + r32[0] = sdot(len_x, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + r32[1].v = 0x1; + r_data = u3i_bytes((4+1)*sizeof(c3_y), (c3_y*)r32); + break;} + + case 6: { + float64_t r64[2]; + r64[0] = ddot(len_x, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + r64[1].v = 0x1; + r_data = u3i_bytes((8+1)*sizeof(c3_y), (c3_y*)r64); + break;} + + case 7: { + float128_t r128[2]; + r128[0] = qdot(len_x, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); + r128[1] = (float128_t){0x1, 0x0}; + r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)r128); + break;} + } + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* diag - diag(x) +*/ + u3_noun + u3qi_la_diag(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + // Assert length of dims is 2. + if (u3qb_lent(shape) != 2) { + return u3m_bail(c3__exit); + } + // Unpack shape into an array of dimensions. + c3_d *dims = _get_dims(shape); + if (dims[0] != dims[1]) { + return u3m_bail(c3__exit); + } + + // Unpack the data as a byte array. We assume total length < 2**64. + c3_d len_x = _get_length(shape); + c3_d syz_x = len_x * pow(2, bloq - 3); + c3_d wyd = pow(2, bloq - 3); + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); + c3_d syz_y = wyd * dims[1]; + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_y+1)*sizeof(c3_y)); + + u3_noun r_data; + + // Grab the index at i*n_x+j in bytes; put it at j. + for (c3_d i = 0; i < dims[1]; i++) { + // Scan across whole field width. 
+ for (c3_y k = 0; k < wyd; k++) { + y_bytes[i*wyd+k] = x_bytes[(i*dims[0]+i)*wyd+k]; + } + } + y_bytes[syz_y] = 0x1; // pin head + + // Unpack the result back into a noun. + r_data = u3i_bytes((syz_y+1)*sizeof(c3_y), y_bytes); + + u3a_free(x_bytes); + u3a_free(y_bytes); + u3a_free(dims); + + return r_data; + } + +/* transpose - x' +*/ + u3_noun + u3qi_la_transpose(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Assert length of dims is 2. + if (u3qb_lent(shape) != 2) { + return u3m_bail(c3__exit); + } + // Unpack shape into an array of dimensions. + c3_d *dims = _get_dims(shape); + + // Unpack the data as a byte array. We assume total length < 2**64. + c3_d len_x = _get_length(shape); + c3_d syz_x = len_x * pow(2, bloq - 3); + c3_d wyd = pow(2, bloq - 3); + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + + u3_noun r_data; + + // Grab the index at i*n_x+j in bytes; put it at j. + for (c3_d i = 0; i < dims[1]; i++) { + for (c3_d j = 0; j < dims[0]; j++) { + // Scan across whole field width. + for (c3_y k = 0; k < wyd; k++) { + y_bytes[(j*dims[1]+i)*wyd+k] = x_bytes[(i*dims[0]+j)*wyd+k]; + } + } + } + y_bytes[syz_x] = 0x1; // pin head + + // Unpack the result back into a noun. + r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + u3a_free(x_bytes); + u3a_free(y_bytes); + u3a_free(dims); + + return r_data; + } + +/* linspace - [a a+(b-a)/n ... b] +*/ + u3_noun + u3qi_la_linspace_i754(u3_noun a, + u3_noun b, + u3_noun n, + u3_noun bloq) + { + // Fence on valid bloq size. 
+ if (bloq < 4 || bloq > 7) { + return u3_none; + } + + u3_noun r_data; + + switch (u3x_atom(bloq)) { + case 4: { + float16_t a16, b16; + u3r_bytes(0, 2, (c3_y*)&(a16.v), a); + u3r_bytes(0, 2, (c3_y*)&(b16.v), b); + float16_t span16 = f16_sub(b16, a16); + float16_t interval16 = f16_div(span16, i32_to_f16(n-1)); + c3_y* x_bytes16 = (c3_y*)u3a_malloc((n*2+1)*sizeof(c3_y)); + for (c3_d i = 1; i < n-1; i++) { + ((float16_t*)x_bytes16)[i] = f16_add(a16, f16_mul(i32_to_f16(i), interval16)); + } + // Assign in reverse order so that n=1 case is correctly left-hand bound. + ((float16_t*)x_bytes16)[n-1] = b16; + ((float16_t*)x_bytes16)[0] = a16; + x_bytes16[n*2] = 0x1; // pin head + r_data = u3i_bytes((n*2+1)*sizeof(c3_y), x_bytes16); + u3a_free(x_bytes16); + break;} + + case 5: { + float32_t a32, b32; + u3r_bytes(0, 4, (c3_y*)&(a32.v), a); + u3r_bytes(0, 4, (c3_y*)&(b32.v), b); + float32_t span32 = f32_sub(b32, a32); + float32_t interval32 = f32_div(span32, i32_to_f32(n-1)); + c3_y* x_bytes32 = (c3_y*)u3a_malloc((n*4+1)*sizeof(c3_y)); + for (c3_d i = 1; i < n-1; i++) { + ((float32_t*)x_bytes32)[i] = f32_add(a32, f32_mul(i32_to_f32(i), interval32)); + } + ((float32_t*)x_bytes32)[n-1] = b32; + ((float32_t*)x_bytes32)[0] = a32; + x_bytes32[n*4] = 0x1; // pin head + r_data = u3i_bytes((n*4+1)*sizeof(c3_y), x_bytes32); + u3a_free(x_bytes32); + break;} + + case 6: { + float64_t a64, b64; + u3r_bytes(0, 8, (c3_y*)&(a64.v), a); + u3r_bytes(0, 8, (c3_y*)&(b64.v), b); + float64_t span64 = f64_sub(b64, a64); + float64_t interval64 = f64_div(span64, i32_to_f64(n-1)); + c3_y* x_bytes64 = (c3_y*)u3a_malloc((n*8+1)*sizeof(c3_y)); + for (c3_d i = 1; i < n-1; i++) { + ((float64_t*)x_bytes64)[i] = f64_add(a64, f64_mul(i32_to_f64(i), interval64)); + } + ((float64_t*)x_bytes64)[n-1] = b64; + ((float64_t*)x_bytes64)[0] = a64; + x_bytes64[n*8] = 0x1; // pin head + r_data = u3i_bytes((n*8+1)*sizeof(c3_y), x_bytes64); + u3a_free(x_bytes64); + break;} + + case 7: { + float128_t a128, b128; + 
u3r_bytes(0, 16, (c3_y*)&(a128.v[0]), a); + u3r_bytes(0, 16, (c3_y*)&(b128.v[0]), b); + float128_t span128; + f128M_sub(&b128, &a128, &span128); + float128_t interval128; + float128_t n128; + i32_to_f128M(n-1, &n128); + f128M_div(&span128, &n128, &interval128); + c3_y* x_bytes128 = (c3_y*)u3a_malloc((n*16+1)*sizeof(c3_y)); + float128_t i128; + for (c3_d i = 1; i < n-1; i++) { + i32_to_f128M(i, &i128); + f128M_mul(&i128, &interval128, &((float128_t*)x_bytes128)[i]); + f128M_add(&a128, &((float128_t*)x_bytes128)[i], &((float128_t*)x_bytes128)[i]); + } + ((float128_t*)x_bytes128)[n-1] = b128; + ((float128_t*)x_bytes128)[0] = a128; + x_bytes128[n*16] = 0x1; // pin head + r_data = u3i_bytes((n*16+1)*sizeof(c3_y), x_bytes128); + u3a_free(x_bytes128); + break;} + } + + return r_data; + } + +/* range - [a a+d ... b] +*/ + u3_noun + u3qi_la_range_i754(u3_noun a, + u3_noun b, + u3_noun d, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + u3_noun r_data; + + switch (u3x_atom(bloq)) { + case 4: { + float16_t a16, b16, interval16; + u3r_bytes(0, 2, (c3_y*)&(a16.v), a); + u3r_bytes(0, 2, (c3_y*)&(b16.v), b); + u3r_bytes(0, 2, (c3_y*)&(interval16.v), d); + c3_d n16 = f16_to_i64(f16_ceil(f16_div(f16_sub(b16, a16), interval16)), softfloat_round_minMag, false); + c3_y* x_bytes16 = (c3_y*)u3a_malloc(((n16+1)*2)*sizeof(c3_y)); + ((float16_t*)x_bytes16)[0] = a16; + for (c3_d i = 1; i < n16; i++) { + ((float16_t*)x_bytes16)[i] = f16_add(a16, f16_mul(i32_to_f16(i), interval16)); + } + ((float16_t*)x_bytes16)[n16].v = 0x1; // pin head + r_data = u3i_bytes(((n16+1)*2)*sizeof(c3_y), x_bytes16); + u3a_free(x_bytes16); + break;} + + case 5: { + float32_t a32, b32, interval32; + u3r_bytes(0, 4, (c3_y*)&(a32.v), a); + u3r_bytes(0, 4, (c3_y*)&(b32.v), b); + u3r_bytes(0, 4, (c3_y*)&(interval32.v), d); + c3_d n32 = f32_to_i64(f32_ceil(f32_div(f32_sub(b32, a32), interval32)), softfloat_round_minMag, false); + c3_y* x_bytes32 = 
(c3_y*)u3a_malloc(((n32+1)*4)*sizeof(c3_y)); + ((float32_t*)x_bytes32)[0] = a32; + for (c3_d i = 1; i < n32; i++) { + ((float32_t*)x_bytes32)[i] = f32_add(a32, f32_mul(i32_to_f32(i), interval32)); + } + ((float32_t*)x_bytes32)[n32].v = 0x1; // pin head + r_data = u3i_bytes(((n32+1)*4)*sizeof(c3_y), x_bytes32); + u3a_free(x_bytes32); + break;} + + case 6: { + float64_t a64, b64, interval64; + u3r_bytes(0, 8, (c3_y*)&(a64.v), a); + u3r_bytes(0, 8, (c3_y*)&(b64.v), b); + u3r_bytes(0, 8, (c3_y*)&(interval64.v), d); + c3_d n64 = f64_to_i64(f64_ceil(f64_div(f64_sub(b64, a64), interval64)), softfloat_round_minMag, false); + c3_y* x_bytes64 = (c3_y*)u3a_malloc(((n64+1)*8)*sizeof(c3_y)); + ((float64_t*)x_bytes64)[0] = a64; + for (c3_d i = 1; i < n64; i++) { + ((float64_t*)x_bytes64)[i] = f64_add(a64, f64_mul(i32_to_f64(i), interval64)); + } + ((float64_t*)x_bytes64)[n64].v = 0x1; // pin head + r_data = u3i_bytes(((n64+1)*8)*sizeof(c3_y), x_bytes64); + u3a_free(x_bytes64); + break;} + + case 7: { + float128_t a128, b128, interval128; + u3r_bytes(0, 16, (c3_y*)&(a128.v[0]), a); + u3r_bytes(0, 16, (c3_y*)&(b128.v[0]), b); + u3r_bytes(0, 16, (c3_y*)&(interval128.v[0]), d); + float128_t tmp; + f128M_sub(&b128, &a128, &tmp); + f128M_div(&tmp, &interval128, &tmp); + f128M_ceil(&tmp, &tmp); + c3_d n128 = f128M_to_i64(&tmp, softfloat_round_minMag, false); + c3_y* x_bytes128 = (c3_y*)u3a_malloc(((n128+1)*16)*sizeof(c3_y)); + float128_t i128; + ((float128_t*)x_bytes128)[0] = a128; + for (c3_d i = 1; i < n128; i++) { + i32_to_f128M(i, &i128); + f128M_mul(&i128, &interval128, &((float128_t*)x_bytes128)[i]); + f128M_add(&a128, &((float128_t*)x_bytes128)[i], &((float128_t*)x_bytes128)[i]); + } + ((float128_t*)x_bytes128)[n128].v[0] = 0x1; // pin head + ((float128_t*)x_bytes128)[n128].v[1] = 0x0; // pin head + r_data = u3i_bytes(((n128+1)*16)*sizeof(c3_y), x_bytes128); + u3a_free(x_bytes128); + break;} + } + + return r_data; + } + +/* trace - tr(x) +*/ + u3_noun + 
u3qi_la_trace_i754(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + u3_noun d_data = u3qi_la_diag(x_data, shape, bloq); + c3_d len_x0 = _get_dims(shape)[0]; + u3_noun r_data = u3qi_la_dot_i754(d_data, d_data, u3nt(len_x0, 0x1, u3_nul), u3k(bloq)); + return r_data; + } + +/* mmul +*/ + u3_noun + u3qi_la_mmul_i754(u3_noun x_data, + u3_noun y_data, + u3_noun x_shape, + u3_noun y_shape, + u3_noun bloq) + { + // Unpack the data as a byte array. We assume total length < 2**64. + c3_d M = u3x_atom(u3h(x_shape)); + c3_d Na= u3x_atom(u3h(u3t(x_shape))); + c3_d Nb= u3x_atom(u3h(y_shape)); + c3_d P = u3x_atom(u3h(u3t(y_shape))); + + if ((u3_nul != u3t(u3t(x_shape))) || + (u3_nul != u3t(u3t(y_shape))) || + (Na != Nb)) { + return u3m_bail(c3__exit); + } + c3_d N = Na; + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(x_shape); // M*N + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); // M*N + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // len_x is length in base units + c3_d len_y = _get_length(y_shape); // N*P + + // syz_x is length in bytes + c3_d syz_y = len_y * pow(2, bloq-3); // N*P + + // y_bytes is the data array (w/o leading 0x1) + c3_y* y_bytes = (c3_y*)u3a_malloc(syz_y*sizeof(c3_y)); + u3r_bytes(0, syz_y, y_bytes, y_data); + + // len_r is length in base units + c3_d len_r = M*P; // M*P + + // syz_r is length in bytes + c3_d syz_r = len_r * pow(2, bloq-3); // M*P + + // r_bytes is the result array + c3_y* r_bytes = (c3_y*)u3a_malloc((syz_r+1)*sizeof(c3_y)); + r_bytes[syz_r] = 0x1; // pin head + // initialize with 0x0s + for (c3_d i = 0; i < syz_r; i++) { + r_bytes[i] = 0x0; + } + + // Switch on the block size. 
+ switch (u3x_atom(bloq)) { + case 4: + hgemm('N', 'N', M, N, P, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, N, (float16_t*)y_bytes, P, (float16_t){SB_REAL16_ZERO}, (float16_t*)r_bytes, P); + break; + + case 5: + sgemm('N', 'N', M, N, P, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, N, (float32_t*)y_bytes, P, (float32_t){SB_REAL32_ZERO}, (float32_t*)r_bytes, P); + break; + + case 6: + dgemm('N', 'N', M, N, P, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, N, (float64_t*)y_bytes, P, (float64_t){SB_REAL64_ZERO}, (float64_t*)r_bytes, P); + break; + + case 7: + qgemm('N', 'N', M, N, P, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, N, (float128_t*)y_bytes, P, (float128_t){SB_REAL128L_ZERO,SB_REAL128U_ZERO}, (float128_t*)r_bytes, P); + break; + } + + // Unpack the result back into a noun. + u3_noun r_data = u3i_bytes(syz_r+1, r_bytes); + u3_noun M_ = u3i_chub(M); + u3_noun P_ = u3i_chub(P); + + u3a_free(x_bytes); + u3a_free(y_bytes); + u3a_free(r_bytes); + + return u3nc(u3nq(u3nt(M_, P_, u3_nul), u3k(bloq), c3__i754, u3_nul), r_data); + } + + u3_noun + u3wi_la_add(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3r_sing(x_meta, y_meta) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(rnd) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_add_i754(x_data, y_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } 
+ return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_sub(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3r_sing(x_meta, y_meta) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(rnd) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_sub_i754(x_data, y_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_mul(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3r_sing(x_meta, y_meta) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(rnd) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = 
u3qi_la_mul_i754(x_data, y_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_div(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3r_sing(x_meta, y_meta) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(rnd) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_div_i754(x_data, y_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_mod(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3r_sing(x_meta, y_meta) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(rnd) + ) + { + return 
u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_mod_i754(x_data, y_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_cumsum(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == _check(u3nc(x_meta, x_data)) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_cumsum_i754(x_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3nc(0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_argmin(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == _check(u3nc(x_meta, x_data)) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_argmin_i754(x_data, x_shape, x_bloq); + // bare atom (@ index) + return r_data;} + + default: + return 
u3_none; + } + } + } + } + + u3_noun + u3wi_la_ravel(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == _check(u3nc(x_meta, x_data)) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_ravel_i754(x_data, x_shape, x_bloq); + // (list @) + return r_data;} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_argmax(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == _check(u3nc(x_meta, x_data)) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_argmax_i754(x_data, x_shape, x_bloq); + // bare atom (@ index) + return r_data;} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_min(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) 
|| + c3n == _check(u3nc(x_meta, x_data)) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_min_i754(x_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_max(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == _check(u3nc(x_meta, x_data)) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_max_i754(x_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_abs(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_abs_i754(x_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), 
u3k(x_kind), u3k(x_tail)), r_data);} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_gth(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3r_sing(x_meta, y_meta) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_gth_i754(x_data, y_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3k(x_meta), r_data);} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_gte(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3r_sing(x_meta, y_meta) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_gte_i754(x_data, y_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3k(x_meta), r_data);} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_lth(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, 
&x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3r_sing(x_meta, y_meta) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_lth_i754(x_data, y_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3k(x_meta), r_data);} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_lte(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3r_sing(x_meta, y_meta) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_lte_i754(x_data, y_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3k(x_meta), r_data);} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_adds(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, n; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_3, &n, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(n) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = 
u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_adds_i754(x_data, n, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + + u3_noun + u3wi_la_subs(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, n; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_3, &n, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(n) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_subs_i754(x_data, n, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + + u3_noun + u3wi_la_muls(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, n; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_3, &n, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(n) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_muls_i754(x_data, n, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), 
u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + + u3_noun + u3wi_la_divs(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, n; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_3, &n, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(n) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_divs_i754(x_data, n, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + + u3_noun + u3wi_la_mods(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, n; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_3, &n, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(n) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_mods_i754(x_data, n, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + + u3_noun + u3wi_la_dot(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 
0) || + c3n == u3r_sing(x_meta, y_meta) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_dot_i754(x_data, y_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + c3_d len_x0 = _get_dims(x_shape)[0]; + return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_transpose(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == _check(cor) + ) + { + return u3m_bail(c3__exit); + } else { + u3_noun r_data = u3qi_la_transpose(x_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3nt(u3k(u3h(x_shape)), u3k(u3h(u3t(x_shape))), u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + } + } + } + + u3_noun + u3wi_la_linspace(u3_noun cor) + { + u3_noun x_meta, a, b, n, rnd; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_12, &a, + u3x_sam_13, &b, + u3x_sam_7, &n, + 0)) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + 
x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(n) || + (n < 1) // crash on zero size + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_linspace_i754(a, b, n, x_bloq); + if (r_data == u3_none) { return u3_none; } + x_shape = u3nc(u3x_atom(n), u3_nul); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_range(u3_noun cor) + { + u3_noun x_meta, a, b, d, rnd; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_12, &a, + u3x_sam_13, &b, + u3x_sam_7, &d, + 0)) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_range_i754(a, b, d, x_bloq); + if (r_data == u3_none) { return u3_none; } + c3_d a_, b_, d_; + c3_ds n_; + switch (x_bloq) { + case 4: + u3r_bytes(0, 2, (c3_y*)&a_, a); + u3r_bytes(0, 2, (c3_y*)&b_, b); + u3r_bytes(0, 2, (c3_y*)&d_, d); + n_ = f16_to_i64(f16_ceil(f16_div(f16_sub((float16_t){b_}, (float16_t){a_}), (float16_t){d_})), softfloat_round_minMag, false) - 1; + break; + case 5: + u3r_bytes(0, 4, (c3_y*)&a_, a); + u3r_bytes(0, 4, (c3_y*)&b_, b); + u3r_bytes(0, 4, (c3_y*)&d_, d); + n_ = f32_to_i64(f32_ceil(f32_div(f32_sub((float32_t){b_}, (float32_t){a_}), (float32_t){d_})), softfloat_round_minMag, false) - 1; + break; + case 6: + u3r_bytes(0, 8, (c3_y*)&a_, a); + u3r_bytes(0, 8, (c3_y*)&b_, b); + u3r_bytes(0, 8, (c3_y*)&d_, d); + n_ = 
f64_to_i64(f64_ceil(f64_div(f64_sub((float64_t){b_}, (float64_t){a_}), (float64_t){d_})), softfloat_round_minMag, false) - 1; + break; + case 7: { + c3_d a__[2], b__[2], d__[2]; + u3r_bytes(0, 16, (c3_y*)&a__, a); + u3r_bytes(0, 16, (c3_y*)&b__, b); + u3r_bytes(0, 16, (c3_y*)&d__, d); + float128_t tmp; + f128M_sub((float128_t*)&b__, (float128_t*)&a__, &tmp); + f128M_div(&tmp, (float128_t*)&d__, &tmp); + f128M_ceil(&tmp, &tmp); + n_ = f128M_to_i64(&tmp, softfloat_round_minMag, false) - 1; + break;} + } + u3_noun n = u3i_chub(n_+1); + x_shape = u3nc(u3k(n), u3_nul); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_diag(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == _check(cor) + ) + { + return u3m_bail(c3__exit); + } else { + u3_noun r_data = u3qi_la_diag(x_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + c3_d len_x0 = _get_dims(x_shape)[0]; + return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + } + } + } + + u3_noun + u3wi_la_trace(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail; + if ( c3n == u3r_mean(x_meta, + 2, &x_shape, + 6, &x_bloq, + 14, &x_kind, + 15, &x_tail, + 0) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { 
+ case c3__i754: { + u3_noun r_data = u3qi_la_trace_i754(x_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_mmul(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, + y_shape, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + y_shape = u3h(y_meta); // 2 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == _check(u3nc(x_meta, x_data)) || + c3n == _check(u3nc(y_meta, y_data)) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_mmul_i754(x_data, y_data, x_shape, y_shape, x_bloq); + // result is already [meta data] + return r_data; + + default: + return u3_none; + } + } + } + } diff --git a/pkg/noun/jets/q.h b/pkg/noun/jets/q.h index ae4704a6b6..a12d4766a8 100644 --- a/pkg/noun/jets/q.h +++ b/pkg/noun/jets/q.h @@ -255,6 +255,35 @@ u3_noun u3qfp_nepo(u3_noun, u3_noun); u3_noun u3qfp_rake(u3_noun); + u3_noun u3qi_la_add_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_sub_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_mul_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_div_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_mod_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_adds_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_subs_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_muls_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_divs_i754(u3_noun, 
u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_mods_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_dot_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_diag(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_transpose(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_cumsum_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_argmin_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_argmax_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_ravel_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_min_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_max_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_linspace_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_range_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_abs_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_gth_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_gte_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_lth_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_lte_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_trace_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_mmul_i754(u3_noun, u3_noun, u3_noun, u3_noun, u3_noun); + # define u3qfu_van_fan 28 # define u3qfu_van_rib 58 # define u3qfu_van_vet 59 diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index 4adb6d6a84..53a8cb952d 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2126,8 +2126,77 @@ static u3j_core _139_hex_json_d[] = {} }; +/* /lib jets in non core +*/ +static u3j_harm _139_non__lagoon_add_a[] = {{".2", u3wi_la_add}, {}}; +static u3j_harm _139_non__lagoon_sub_a[] = {{".2", u3wi_la_sub}, {}}; +static u3j_harm _139_non__lagoon_mul_a[] = {{".2", u3wi_la_mul}, {}}; +static u3j_harm _139_non__lagoon_div_a[] = {{".2", u3wi_la_div}, {}}; +static u3j_harm _139_non__lagoon_mod_a[] = {{".2", u3wi_la_mod}, {}}; +static u3j_harm _139_non__lagoon_adds_a[] = {{".2", u3wi_la_adds}, {}}; +static u3j_harm _139_non__lagoon_subs_a[] = {{".2", u3wi_la_subs}, 
{}}; +static u3j_harm _139_non__lagoon_muls_a[] = {{".2", u3wi_la_muls}, {}}; +static u3j_harm _139_non__lagoon_divs_a[] = {{".2", u3wi_la_divs}, {}}; +static u3j_harm _139_non__lagoon_mods_a[] = {{".2", u3wi_la_mods}, {}}; +static u3j_harm _139_non__lagoon_dot_a[] = {{".2", u3wi_la_dot}, {}}; +static u3j_harm _139_non__lagoon_trans_a[] ={{".2", u3wi_la_transpose}, {}}; +static u3j_harm _139_non__lagoon_cumsum_a[]={{".2", u3wi_la_cumsum}, {}}; +static u3j_harm _139_non__lagoon_argmin_a[]={{".2", u3wi_la_argmin}, {}}; +static u3j_harm _139_non__lagoon_argmax_a[]={{".2", u3wi_la_argmax}, {}}; +static u3j_harm _139_non__lagoon_ravel_a[]={{".2", u3wi_la_ravel}, {}}; +static u3j_harm _139_non__lagoon_min_a[] = {{".2", u3wi_la_min}, {}}; +static u3j_harm _139_non__lagoon_max_a[] = {{".2", u3wi_la_max}, {}}; +static u3j_harm _139_non__lagoon_linspace_a[]={{".2", u3wi_la_linspace}, {}}; +static u3j_harm _139_non__lagoon_range_a[]= {{".2", u3wi_la_range}, {}}; +static u3j_harm _139_non__lagoon_abs_a[] = {{".2", u3wi_la_abs}, {}}; +static u3j_harm _139_non__lagoon_gth_a[] = {{".2", u3wi_la_gth}, {}}; +static u3j_harm _139_non__lagoon_gte_a[] = {{".2", u3wi_la_gte}, {}}; +static u3j_harm _139_non__lagoon_lth_a[] = {{".2", u3wi_la_lth}, {}}; +static u3j_harm _139_non__lagoon_lte_a[] = {{".2", u3wi_la_lte}, {}}; +static u3j_harm _139_non__lagoon_diag_a[] = {{".2", u3wi_la_diag}, {}}; +static u3j_harm _139_non__lagoon_trace_a[]= {{".2", u3wi_la_trace}, {}}; +static u3j_harm _139_non__lagoon_mmul_a[] = {{".2", u3wi_la_mmul}, {}}; +static u3j_core _139_non__la_core_d[] = + { { "add-rays", 7, _139_non__lagoon_add_a, 0, no_hashes }, + { "sub-rays", 7, _139_non__lagoon_sub_a, 0, no_hashes }, + { "mul-rays", 7, _139_non__lagoon_mul_a, 0, no_hashes }, + { "div-rays", 7, _139_non__lagoon_div_a, 0, no_hashes }, + { "mod-rays", 7, _139_non__lagoon_mod_a, 0, no_hashes }, + { "add-scal", 7, _139_non__lagoon_adds_a, 0, no_hashes }, + { "sub-scal", 7, _139_non__lagoon_subs_a, 0, no_hashes }, 
+ { "mul-scal", 7, _139_non__lagoon_muls_a, 0, no_hashes }, + { "div-scal", 7, _139_non__lagoon_divs_a, 0, no_hashes }, + { "mod-scal", 7, _139_non__lagoon_mods_a, 0, no_hashes }, + { "dot", 7, _139_non__lagoon_dot_a, 0, no_hashes }, + { "transpose",7, _139_non__lagoon_trans_a, 0, no_hashes }, + { "cumsum", 7, _139_non__lagoon_cumsum_a, 0, no_hashes }, + { "argmin", 7, _139_non__lagoon_argmin_a, 0, no_hashes }, + { "argmax", 7, _139_non__lagoon_argmax_a, 0, no_hashes }, + { "ravel", 7, _139_non__lagoon_ravel_a, 0, no_hashes }, + { "min", 7, _139_non__lagoon_min_a, 0, no_hashes }, + { "max", 7, _139_non__lagoon_max_a, 0, no_hashes }, + { "linspace", 7, _139_non__lagoon_linspace_a, 0, no_hashes }, + { "range", 7, _139_non__lagoon_range_a, 0, no_hashes }, + { "abs", 7, _139_non__lagoon_abs_a, 0, no_hashes }, + { "gth", 7, _139_non__lagoon_gth_a, 0, no_hashes }, + { "gte", 7, _139_non__lagoon_gte_a, 0, no_hashes }, + { "lth", 7, _139_non__lagoon_lth_a, 0, no_hashes }, + { "lte", 7, _139_non__lagoon_lte_a, 0, no_hashes }, + { "diag", 7, _139_non__lagoon_diag_a, 0, no_hashes }, + { "trace", 7, _139_non__lagoon_trace_a,0, no_hashes }, + { "mmul", 7, _139_non__lagoon_mmul_a, 0, no_hashes }, + {} + }; + +static u3j_core _139_non_d[] = + { { "lagoon", 7, 0, _139_non__la_core_d, no_hashes }, + {} + }; + static u3j_core _139_hex_d[] = -{ { "lore", 63, _140_hex_lore_a, 0, no_hashes }, +{ { "non", 7, 0, _139_non_d, no_hashes }, + + { "lore", 63, _140_hex_lore_a, 0, no_hashes }, { "leer", 63, _140_hex_leer_a, 0, no_hashes }, { "loss", 63, _140_hex_loss_a, 0, no_hashes }, { "lune", 127, _140_hex_lune_a, 0, no_hashes }, diff --git a/pkg/noun/jets/w.h b/pkg/noun/jets/w.h index 24182154bb..f2353bcd3b 100644 --- a/pkg/noun/jets/w.h +++ b/pkg/noun/jets/w.h @@ -350,6 +350,35 @@ **/ u3_noun u3wg_plot_fax(u3_noun); u3_noun u3wg_plot_met(u3_noun); + u3_noun u3wi_la_add(u3_noun); + u3_noun u3wi_la_sub(u3_noun); + u3_noun u3wi_la_mul(u3_noun); + u3_noun u3wi_la_div(u3_noun); + u3_noun 
u3wi_la_mod(u3_noun); + u3_noun u3wi_la_adds(u3_noun); + u3_noun u3wi_la_subs(u3_noun); + u3_noun u3wi_la_muls(u3_noun); + u3_noun u3wi_la_divs(u3_noun); + u3_noun u3wi_la_mods(u3_noun); + u3_noun u3wi_la_dot(u3_noun); + u3_noun u3wi_la_diag(u3_noun); + u3_noun u3wi_la_transpose(u3_noun); + u3_noun u3wi_la_cumsum(u3_noun); + u3_noun u3wi_la_argmin(u3_noun); + u3_noun u3wi_la_argmax(u3_noun); + u3_noun u3wi_la_ravel(u3_noun); + u3_noun u3wi_la_min(u3_noun); + u3_noun u3wi_la_max(u3_noun); + u3_noun u3wi_la_linspace(u3_noun); + u3_noun u3wi_la_range(u3_noun); + u3_noun u3wi_la_abs(u3_noun); + u3_noun u3wi_la_gth(u3_noun); + u3_noun u3wi_la_gte(u3_noun); + u3_noun u3wi_la_lth(u3_noun); + u3_noun u3wi_la_lte(u3_noun); + + u3_noun u3wi_la_trace(u3_noun); + u3_noun u3wi_la_mmul(u3_noun); #endif /* ifndef U3_JETS_W_H */ diff --git a/pkg/noun/manage.c b/pkg/noun/manage.c index 90b993469c..d167994a46 100644 --- a/pkg/noun/manage.c +++ b/pkg/noun/manage.c @@ -473,15 +473,17 @@ u3m_file(c3_c* pas_c) /* u3m_mark(): mark all nouns in the road. */ -c3_w -u3m_mark(FILE* fil_u) +u3m_quac** +u3m_mark(void) { - c3_w tot_w = 0; - tot_w += u3v_mark(fil_u); - tot_w += u3j_mark(fil_u); - tot_w += u3n_mark(fil_u); - tot_w += u3a_mark_road(fil_u); - return tot_w; + u3m_quac** qua_u = c3_malloc(sizeof(*qua_u) * 5); + qua_u[0] = u3v_mark(); + qua_u[1] = u3j_mark(); + qua_u[2] = u3n_mark(); + qua_u[3] = u3a_mark_road(); + qua_u[4] = NULL; + + return qua_u; } /* _pave_parts(): build internal tables. @@ -1540,7 +1542,7 @@ u3m_grab(u3_noun som, ...) // terminate with u3_none // u3h_free(u3R->cax.har_p); // u3R->cax.har_p = u3h_new(); - u3m_mark(0); + u3m_mark(); { va_list vap; u3_noun tur; diff --git a/pkg/noun/manage.h b/pkg/noun/manage.h index 46a8bcb61e..8e543992b9 100644 --- a/pkg/noun/manage.h +++ b/pkg/noun/manage.h @@ -148,10 +148,19 @@ u3_noun u3m_soft_esc(u3_noun ref, u3_noun sam); + + /* u3m_quac: memory report. 
+ */ + typedef struct _u3m_quac { + c3_c* nam_c; + c3_w siz_w; + struct _u3m_quac** qua_u; + } u3m_quac; + /* u3m_mark(): mark all nouns in the road. */ - c3_w - u3m_mark(FILE* fil_u); + u3m_quac** + u3m_mark(); /* u3m_grab(): garbage-collect the world, plus extra roots. */ diff --git a/pkg/noun/nock.c b/pkg/noun/nock.c index f9814a395f..076717b7ad 100644 --- a/pkg/noun/nock.c +++ b/pkg/noun/nock.c @@ -3058,16 +3058,30 @@ _n_bam(u3_noun kev, void* dat) /* u3n_mark(): mark the bytecode cache for gc. */ -c3_w -u3n_mark(FILE* fil_u) +u3m_quac* +u3n_mark() { - c3_w bam_w = 0, har_w = 0; + u3m_quac** qua_u = c3_malloc(sizeof(*qua_u) * 3); + + qua_u[0] = c3_calloc(sizeof(*qua_u[0])); + qua_u[0]->nam_c = strdup("bytecode programs"); + u3p(u3h_root) har_p = u3R->byc.har_p; - u3h_walk_with(har_p, _n_bam, &bam_w); + u3h_walk_with(har_p, _n_bam, &qua_u[0]->siz_w); + qua_u[0]->siz_w = qua_u[0]->siz_w * 4; + + qua_u[1] = c3_calloc(sizeof(*qua_u[1])); + qua_u[1]->nam_c = strdup("bytecode cache"); + qua_u[1]->siz_w = u3h_mark(har_p) * 4; + + qua_u[2] = NULL; + + u3m_quac* tot_u = c3_malloc(sizeof(*tot_u)); + tot_u->nam_c = strdup("total nock stuff"); + tot_u->siz_w = qua_u[0]->siz_w + qua_u[1]->siz_w; + tot_u->qua_u = qua_u; - bam_w = u3a_maid(fil_u, " bytecode programs", bam_w); - har_w = u3a_maid(fil_u, " bytecode cache", u3h_mark(har_p)); - return u3a_maid(fil_u, "total nock stuff", bam_w + har_w); + return tot_u; } /* u3n_reclaim(): clear ad-hoc persistent caches to reclaim memory. diff --git a/pkg/noun/nock.h b/pkg/noun/nock.h index 3f1830a301..5c34515a02 100644 --- a/pkg/noun/nock.h +++ b/pkg/noun/nock.h @@ -123,8 +123,8 @@ /* u3n_mark(): mark bytecode cache. */ - c3_w - u3n_mark(FILE* fil_u); + u3m_quac* + u3n_mark(); /* u3n_reclaim(): clear ad-hoc persistent caches to reclaim memory. 
*/ diff --git a/pkg/noun/vortex.c b/pkg/noun/vortex.c index cb95c8edc3..0abb61babc 100644 --- a/pkg/noun/vortex.c +++ b/pkg/noun/vortex.c @@ -384,16 +384,33 @@ u3v_sway(u3_noun blu, c3_l tab_l, u3_noun tax) /* u3v_mark(): mark arvo kernel. */ -c3_w -u3v_mark(FILE* fil_u) +u3m_quac* +u3v_mark() { u3v_arvo* arv_u = &(u3H->arv_u); - c3_w tot_w = 0; - tot_w += u3a_maid(fil_u, " kernel", u3a_mark_noun(arv_u->roc)); - tot_w += u3a_maid(fil_u, " date", u3a_mark_noun(arv_u->now)); - tot_w += u3a_maid(fil_u, " wish cache", u3a_mark_noun(arv_u->yot)); - return u3a_maid(fil_u, "total arvo stuff", tot_w); + u3m_quac** qua_u = c3_malloc(sizeof(*qua_u) * 4); + + qua_u[0] = c3_calloc(sizeof(*qua_u[0])); + qua_u[0]->nam_c = strdup("kernel"); + qua_u[0]->siz_w = u3a_mark_noun(arv_u->roc) * 4; + + qua_u[1] = c3_calloc(sizeof(*qua_u[1])); + qua_u[1]->nam_c = strdup("date"); + qua_u[1]->siz_w = u3a_mark_noun(arv_u->now) * 4; + + qua_u[2] = c3_calloc(sizeof(*qua_u[2])); + qua_u[2]->nam_c = strdup("wish cache"); + qua_u[2]->siz_w = u3a_mark_noun(arv_u->yot) * 4; + + qua_u[3] = NULL; + + u3m_quac* tot_u = c3_malloc(sizeof(*tot_u)); + tot_u->nam_c = strdup("total arvo stuff"); + tot_u->siz_w = qua_u[0]->siz_w + qua_u[1]->siz_w + qua_u[2]->siz_w; + tot_u->qua_u = qua_u; + + return tot_u; } /* u3v_reclaim(): clear ad-hoc persistent caches to reclaim memory. @@ -426,4 +443,3 @@ u3v_rewrite_compact(void) arv_u->now = u3a_rewritten_noun(arv_u->now); arv_u->yot = u3a_rewritten_noun(arv_u->yot); } - diff --git a/pkg/noun/vortex.h b/pkg/noun/vortex.h index 9cc56474e0..5772543dcb 100644 --- a/pkg/noun/vortex.h +++ b/pkg/noun/vortex.h @@ -130,8 +130,8 @@ /* u3v_mark(): mark arvo kernel. */ - c3_w - u3v_mark(FILE* fil_u); + u3m_quac* + u3v_mark(); /* u3v_reclaim(): clear ad-hoc persistent caches to reclaim memory. 
*/ diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index 63812a4792..094c813af6 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -44,7 +44,8 @@ typedef struct _u3_h2o_serv { typedef struct _u3_preq { struct _u3_hreq* req_u; // originating request (nullable) struct _u3_httd* htd_u; // device backpointer - u3_noun pax; // partial scry path + u3_noun pax; // partial scry path + c3_o las_o; // was scry at now } u3_preq; /* u3_hcon: incoming http connection. @@ -638,7 +639,235 @@ _http_seq_new(u3_hcon* hon_u, h2o_req_t* rec_u) return req_u; } -/* _http_req_dispatch(): dispatch http request to %eyre +static void +_http_cache_respond(u3_hreq* req_u, u3_noun nun); + +static void +_http_scry_respond(u3_hreq* req_u, u3_noun nun); + +typedef struct _byte_range { + c3_z beg_z; + c3_z end_z; +} byte_range; + +/* _chunk_align(): align range to a nearby chunk +*/ +static void +_chunk_align(byte_range* rng_u) +{ + c3_z siz_z = 4194304; // 4MiB + + if ( SIZE_MAX != rng_u->beg_z ) { + if ( rng_u->beg_z > rng_u->end_z ) { + rng_u->beg_z = SIZE_MAX; + rng_u->end_z = SIZE_MAX; + } + else { + // XX an out-of-bounds request could be aligned to in-bounds + // resulting in a 200 or 206 response instead of 416. + // browsers should have the total length from content-range, + // and send reasonable range requests. 
+ // + rng_u->beg_z = (rng_u->beg_z / siz_z) * siz_z; + rng_u->end_z = (rng_u->beg_z + siz_z) - 1; + } + } + else if ( SIZE_MAX != rng_u->end_z ) { + // round up to multiple of siz_z + rng_u->end_z = siz_z * ((rng_u->end_z / siz_z) + 1); + } +} + +/* _parse_range(): get a range from '-' delimited text +*/ +static byte_range +_parse_range(c3_c* txt_c, c3_w len_w) +{ + c3_c* hep_c = memchr(txt_c, '-', len_w); + byte_range rng_u; + rng_u.beg_z = SIZE_MAX; + rng_u.end_z = SIZE_MAX; + + if ( hep_c ) { + rng_u.beg_z = h2o_strtosize(txt_c, hep_c - txt_c); + rng_u.end_z = h2o_strtosize(hep_c + 1, len_w - ((hep_c + 1) - txt_c)); + // strange -> [SIZE_MAX SIZE_MAX] + if ( ((SIZE_MAX == rng_u.beg_z) && (hep_c != txt_c)) + || ((SIZE_MAX == rng_u.end_z) && (len_w - ((hep_c + 1) - txt_c) > 0)) + || ((SIZE_MAX != rng_u.beg_z) && (rng_u.beg_z > rng_u.end_z)) ) + { + rng_u.beg_z = SIZE_MAX; + rng_u.end_z = SIZE_MAX; + } + } + return rng_u; +} + +/* _get_range(): get a _byte_range from headers +*/ +static c3_o +_get_range(h2o_headers_t req_headers, byte_range* rng_u) +{ + rng_u->beg_z = SIZE_MAX; + rng_u->end_z = SIZE_MAX; + + c3_w inx_w = h2o_find_header(&req_headers, H2O_TOKEN_RANGE, -1); + if ( UINT32_MAX == inx_w) { + return c3n; + } + + if ( (req_headers.entries[inx_w].value.len >= 6) + && (0 == memcmp("bytes=", req_headers.entries[inx_w].value.base, 6)) ) + { + byte_range tmp_u = _parse_range(req_headers.entries[inx_w].value.base + 6, + req_headers.entries[inx_w].value.len - 6); + rng_u->beg_z = tmp_u.beg_z; + rng_u->end_z = tmp_u.end_z; + } + + return c3y; +} + +/* _http_scry_cb(): respond and maybe cache scry result +*/ +static void +_http_scry_cb(void* vod_p, u3_noun nun) +{ + u3_preq* peq_u = vod_p; + u3_httd* htd_u = peq_u->htd_u; + u3_hreq* req_u = peq_u->req_u; + u3_hfig* fig_u = &req_u->hon_u->htp_u->htd_u->fig_u; + c3_o auth = _http_req_is_auth(fig_u, req_u->rec_u); + + if ( req_u ) { + u3_assert(u3_rsat_peek == req_u->sat_e); + req_u->peq_u = 0; + 
_http_scry_respond(req_u, u3k(nun)); + } + + // cache only if peek was not at now, and nun isn't u3_nul + if ( (c3n == peq_u->las_o) + && (u3_nul != nun) ) + { + u3_noun key = u3nc(auth, u3k(peq_u->pax)); + u3h_put(htd_u->nax_p, key, nun); + u3z(key); + } + else { + u3z(nun); + } + + u3z(peq_u->pax); + c3_free(peq_u); +} + +/* _beam: ship desk case spur +*/ +typedef struct _beam { + u3_weak who; + u3_weak des; + u3_weak cas; + u3_weak pur; +} beam; + +/* _free_beam(): free a beam +*/ +static void +_free_beam(beam* bem) +{ + u3z(bem->who); + u3z(bem->des); + u3z(bem->cas); + u3z(bem->pur); +} + +/* _get_beam(): get a _beam from url +*/ +static beam +_get_beam(u3_hreq* req_u, c3_c* txt_c, c3_w len_w) +{ + beam bem; + + // get beak + // + for ( c3_w i_w = 0; i_w < 3; ++i_w ) { + u3_noun* wer; + if ( 0 == i_w ) { + wer = &bem.who; + } + else if ( 1 == i_w ) { + wer = &bem.des; + } + else { + wer = &bem.cas; + } + + // find '//' + if ( (len_w >= 2) + && ('/' == txt_c[0]) + && ('/' == txt_c[1]) ) + { + *wer = u3_nul; + txt_c++; + len_w--; + } + // skip '/' + else if ( (len_w > 0) && ('/' == txt_c[0]) ) { + txt_c++; + len_w--; + } + + // '=' + if ( (len_w > 0) && ('=' == txt_c[0]) ) { + if ( 0 == i_w ) { + u3_http* htp_u = req_u->hon_u->htp_u; + u3_httd* htd_u = htp_u->htd_u; + *wer = u3dc("scot", 'p', u3i_chubs(2, htd_u->car_u.pir_u->who_d)); + } + else if ( 1 == i_w ) { + *wer = c3__base; + } + else { + req_u->peq_u->las_o = c3y; + } + txt_c++; + len_w--; + } + // slice cord + else { + c3_c* nex_c; + c3_c* tis_c = memchr(txt_c, '=', len_w); + c3_c* fas_c = memchr(txt_c, '/', len_w); + + if ( tis_c && fas_c ) { + nex_c = c3_min(tis_c, fas_c); + } + else { + nex_c = ( tis_c ) ? 
tis_c : fas_c; + } + + if ( !nex_c ) { + *wer = u3_none; + return bem; + } + else { + c3_w dif_w = (c3_p)(nex_c - txt_c); + *wer = u3i_bytes(dif_w, (const c3_y*)txt_c); + txt_c = nex_c; + len_w = len_w - dif_w; + } + } + } + + // get spur + u3_noun tmp = u3dc("rush", u3i_bytes(len_w, (const c3_y*)txt_c), u3v_wish("stap")); + bem.pur = ( u3_nul == tmp ) ? u3_none : u3k(u3t(tmp)); + u3z(tmp); + + return bem; +} + +/* _http_req_dispatch(): dispatch http request */ static void _http_req_dispatch(u3_hreq* req_u, u3_noun req) @@ -649,28 +878,137 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) { u3_http* htp_u = req_u->hon_u->htp_u; u3_httd* htd_u = htp_u->htd_u; - u3_noun wir = _http_req_to_duct(req_u); - u3_noun cad; + c3_c* bas_c = req_u->rec_u->input.path.base; + c3_w len_w = req_u->rec_u->input.path.len; + + // check if base url starts with '/_~_/' + if ( (len_w < 6) + || (0 != memcmp("/_~_/", bas_c, 5)) ) { + // no: inject to arvo + u3_noun wir = _http_req_to_duct(req_u); + u3_noun cad; u3_noun adr = u3nc(c3__ipv4, u3i_words(1, &req_u->hon_u->ipf_w)); // XX loopback automatically secure too? - // + // u3_noun dat = u3nt(htp_u->sec, adr, req); cad = ( c3y == req_u->hon_u->htp_u->lop ) ? 
u3nc(u3i_string("request-local"), dat) : u3nc(u3i_string("request"), dat); + u3_auto_plan(&htd_u->car_u, u3_ovum_init(0, c3__e, wir, cad)); } + else { + // '/_~_/' found + bas_c = bas_c + 4; // retain '/' after /_~_ + len_w = len_w - 4; + + req_u->peq_u = c3_malloc(sizeof(*req_u->peq_u)); + req_u->peq_u->req_u = req_u; + req_u->peq_u->htd_u = htd_u; + req_u->peq_u->las_o = c3n; + req_u->sat_e = u3_rsat_peek; + req_u->peq_u->pax = u3_nul; + + u3_hfig* fig_u = &req_u->hon_u->htp_u->htd_u->fig_u; + h2o_req_t* rec_u = req_u->rec_u; + + // set gang to [~ ~] or ~ + u3_noun gang; + c3_o auth = _http_req_is_auth(fig_u, rec_u); + if ( auth == c3y ) { + gang = u3nc(u3_nul, u3_nul); + } + else { + gang = u3_nul; + } - u3_auto_plan(&htd_u->car_u, u3_ovum_init(0, c3__e, wir, cad)); + beam bem = _get_beam(req_u, bas_c, len_w); + if ( (u3_none == bem.who) + || (u3_none == bem.des) + || (u3_none == bem.cas) + || (u3_none == bem.pur) ) + { + c3_c* msg_c = "bad request"; + h2o_send_error_generic(req_u->rec_u, 400, msg_c, msg_c, 0); + u3z(gang); + u3z(req_u->peq_u->pax); + _free_beam(&bem); + return; + } + + h2o_headers_t req_headers = req_u->rec_u->headers; + byte_range rng_u; + c3_o rng_o = _get_range(req_headers, &rng_u); + + // prepare spur for eyre range scry + // + u3_noun spur; + if ( c3n == rng_o ) { + // full range: '/range/0//foo' + spur = u3nq(u3i_string("range"), c3_s1('0'), u3_blip, u3k(bem.pur)); + } + else { + _chunk_align(&rng_u); + + u3_atom beg = ( SIZE_MAX == rng_u.beg_z) ? + u3_blip : u3dc("scot", c3__ud, u3i_chub(rng_u.beg_z)); + u3_atom end = ( SIZE_MAX == rng_u.end_z) ? 
+ u3_blip : u3dc("scot", c3__ud, u3i_chub(rng_u.end_z)); + + spur = u3nq(u3i_string("range"), beg, end, u3k(bem.pur)); + } + + // peek or respond from cache + // + if ( c3y == req_u->peq_u->las_o ) { + u3_noun our = u3dc("scot", 'p', u3i_chubs(2, htd_u->car_u.pir_u->who_d)); + if ( our == bem.who ) { + u3_pier_peek_last(htd_u->car_u.pir_u, gang, c3__ex, + u3k(bem.des), spur, req_u->peq_u, _http_scry_cb); + } + else { + c3_c* msg_c = "bad request"; + h2o_send_error_generic(req_u->rec_u, 400, msg_c, msg_c, 0); + u3z(gang); + u3z(spur); + u3z(req_u->peq_u->pax); + } + u3z(our); + } + else { + u3_noun bam = u3nq(u3k(bem.who), u3k(bem.des), u3k(bem.cas), spur); + u3_noun key = u3nc(auth, u3k(bam)); + u3_weak nac = u3h_get(htd_u->nax_p, key); + u3z(key); + + if ( (u3_none == nac) + || ((u3_nul == gang) && (c3y == u3r_at(14, nac))) ) + { + // maybe cache, then serve subsequent range requests from cache + u3z(req_u->peq_u->pax); + req_u->peq_u->pax = u3k(bam); + u3_pier_peek(htd_u->car_u.pir_u, gang, u3nt(0, c3__ex, bam), + req_u->peq_u, _http_scry_cb); + u3z(nac); + } + else { + _http_scry_respond(req_u, nac); + u3z(bam); + u3z(gang); + } + } + _free_beam(&bem); + } } } /* _http_cache_respond(): respond with a simple-payload:http */ static void -_http_cache_respond(u3_hreq* req_u, u3_noun nun) { +_http_cache_respond(u3_hreq* req_u, u3_noun nun) +{ h2o_req_t* rec_u = req_u->rec_u; u3_httd* htd_u = req_u->hon_u->htp_u->htd_u; @@ -694,7 +1032,52 @@ _http_cache_respond(u3_hreq* req_u, u3_noun nun) { } else { u3_noun auth, response_header, data; - u3x_qual(u3k(u3t(u3t(nun))), &auth, 0, &response_header, &data); + u3x_qual(u3t(u3t(nun)), &auth, 0, &response_header, &data); + u3_noun status, headers; + u3x_cell(response_header, &status, &headers); + + // check auth + if ( (c3y == auth) + && (c3n == _http_req_is_auth(&htd_u->fig_u, rec_u)) ) + { + h2o_send_error_403(rec_u, "Unauthorized", "unauthorized", 0); + } + else { + req_u->sat_e = u3_rsat_plan; + _http_start_respond(req_u, 
u3k(status), u3k(headers), u3k(data), c3y); + } + } + u3z(nun); +} + +/* _http_scry_respond(): respond with a simple-payload:http +*/ +static void +_http_scry_respond(u3_hreq* req_u, u3_noun nun) +{ + h2o_req_t* rec_u = req_u->rec_u; + u3_httd* htd_u = req_u->hon_u->htp_u->htd_u; + + if ( u3_nul == nun ) { + u3_weak req = _http_rec_to_httq(rec_u); + if ( u3_none == req ) { + if ( (u3C.wag_w & u3o_verbose) ) { + u3l_log("strange %.*s request", (c3_i)rec_u->method.len, + rec_u->method.base); + } + c3_c* msg_c = "bad request"; + h2o_send_error_generic(rec_u, 400, msg_c, msg_c, 0); + } + else { + h2o_send_error_500(rec_u, "Internal Server Error", "scry failed", 0); + } + } + else if ( u3_none == u3r_at(7, nun) ) { + h2o_send_error_500(rec_u, "Internal Server Error", "scry failed", 0); + } + else { + u3_noun auth, response_header, data; + u3x_qual(u3t(u3t(nun)), &auth, 0, &response_header, &data); u3_noun status, headers; u3x_cell(response_header, &status, &headers); diff --git a/pkg/vere/io/term.c b/pkg/vere/io/term.c index 2e6e20895c..014f2cc90c 100644 --- a/pkg/vere/io/term.c +++ b/pkg/vere/io/term.c @@ -1603,6 +1603,13 @@ _term_io_talk(u3_auto* car_u) u3_noun wir = u3nt(c3__term, '1', u3_nul); u3_noun cad; + // send born event + // + { + cad = u3nc(c3__born, u3_nul); + _term_ovum_plan(car_u, u3k(wir), cad); + } + // send terminal dimensions // { @@ -1639,6 +1646,17 @@ _reck_orchid(u3_noun fot, u3_noun txt, c3_l* tid_l) } } +/* _term_io_quiz(): handle quiz (query to serf). +*/ +static void +_term_io_quiz(void* vod_p, u3_noun res) +{ + u3_auto* car_u = (u3_auto*)vod_p; + u3_noun wir = u3nt(c3__term, '1', u3_nul); + u3_noun cad = u3k(res); + u3_auto_plan(car_u, u3_ovum_init(0, c3__d, wir, cad)); +} + /* _term_io_kick(): apply effects. 
*/ static c3_o @@ -1727,6 +1745,17 @@ _term_io_kick(u3_auto* car_u, u3_noun wir, u3_noun cad) ret_o = c3y; u3_pier_pack(car_u->pir_u); } break; + + case c3__quac: { + ret_o = c3y; + u3_writ* wit_u = u3_lord_writ_new(u3K.pir_u->god_u); + wit_u->typ_e = u3_writ_quiz; + wit_u->qui_u.ptr_v = car_u; + wit_u->qui_u.quiz_f = _term_io_quiz; + + u3_lord_writ_plan(u3K.pir_u->god_u, wit_u); + + } break; } } } diff --git a/pkg/vere/king.c b/pkg/vere/king.c index d7238cbfee..4f2009bb09 100644 --- a/pkg/vere/king.c +++ b/pkg/vere/king.c @@ -1658,7 +1658,6 @@ u3_king_bail(void) void u3_king_grab(void* vod_p) { - c3_w tot_w = 0; FILE* fil_u; u3_assert( u3R == &(u3H->rod_u) ); @@ -1694,11 +1693,32 @@ u3_king_grab(void* vod_p) } #endif - tot_w += u3m_mark(fil_u); - tot_w += u3_pier_mark(fil_u); + u3m_quac** all_u = c3_malloc(sizeof(*all_u)*6); - u3a_print_memory(fil_u, "total marked", tot_w); - u3a_print_memory(fil_u, "sweep", u3a_sweep()); + u3m_quac** var_u = u3m_mark(); + all_u[0] = var_u[0]; + all_u[1] = var_u[1]; + all_u[2] = var_u[2]; + all_u[3] = var_u[3]; + c3_free(var_u); + + c3_w tot_w = all_u[0]->siz_w + all_u[1]->siz_w + + all_u[2]->siz_w + all_u[3]->siz_w; + + all_u[4] = c3_calloc(sizeof(*all_u[4])); + all_u[4]->nam_c = "total marked"; + all_u[4]->siz_w = tot_w; + + all_u[5] = c3_calloc(sizeof(*all_u[5])); + all_u[5]->nam_c = "sweep"; + all_u[5]->siz_w = u3a_sweep(); + + for ( c3_w i_w = 0; i_w < 6; i_w++ ) { + u3a_print_quac(fil_u, 0, all_u[i_w]); + u3a_quac_free(all_u[i_w]); + } + + c3_free(all_u); #ifdef U3_MEMORY_LOG { diff --git a/pkg/vere/lord.c b/pkg/vere/lord.c index b7c03dfe6f..d7ebf20e8f 100644 --- a/pkg/vere/lord.c +++ b/pkg/vere/lord.c @@ -23,6 +23,7 @@ [%peek mil=@ sam=*] :: gang (each path $%([%once @tas @tas path] [%beam @tas beam])) [%play eve=@ lit=(list ?((pair @da ovum) *))] [%work mil=@ job=(pair @da ovum)] + [%quiz $%([%quac ~])] == :: +plea: from serf to king :: @@ -31,6 +32,7 @@ [%ripe [pro=%1 hon=@ nok=@] eve=@ mug=@] [%slog pri=@ tank] [%flog 
cord] + [%quiz $%([%quac p=*])] $: %peek $% [%done dat=(unit (cask))] [%bail dud=goof] @@ -532,6 +534,16 @@ _lord_plea_play(u3_lord* god_u, u3_noun dat) u3z(dat); } +/* _lord_plea_quiz(): handle quiz (query to serf). + */ +static void +_lord_plea_quiz(u3_lord* god_u, u3_noun dat) +{ + u3_writ* wit_u = _lord_writ_need(god_u, u3_writ_quiz); + wit_u->qui_u.quiz_f(wit_u->qui_u.ptr_v, dat); + u3z(dat); +} + /* _lord_work_spin(): update spinner if more work is in progress. */ static void @@ -760,15 +772,19 @@ _lord_on_plea(void* ptr_v, c3_d len_d, c3_y* byt_y) case c3__ripe: { _lord_plea_ripe(god_u, u3k(dat)); } break; + + case c3__quiz: { + _lord_plea_quiz(god_u, u3k(dat)); + } break; } u3z(jar); } -/* _lord_writ_new(): allocate a new writ. +/* u3_lord_writ_new(): allocate a new writ. */ -static u3_writ* -_lord_writ_new(u3_lord* god_u) +u3_writ* +u3_lord_writ_new(u3_lord* god_u) { u3_writ* wit_u = c3_calloc(sizeof(*wit_u)); return wit_u; @@ -830,6 +846,10 @@ _lord_writ_make(u3_lord* god_u, u3_writ* wit_u) // msg = u3nt(c3__live, c3__exit, 0); } break; + + case u3_writ_quiz: { + msg = u3nt(c3__quiz, c3__quac, u3_nul); + } break; } return msg; @@ -867,10 +887,10 @@ _lord_writ_send(u3_lord* god_u, u3_writ* wit_u) } } -/* _lord_writ_plan(): enqueue a writ and send. +/* u3_lord_writ_plan(): enqueue a writ and send. 
*/ -static void -_lord_writ_plan(u3_lord* god_u, u3_writ* wit_u) +void +u3_lord_writ_plan(u3_lord* god_u, u3_writ* wit_u) { if ( !god_u->ent_u ) { u3_assert( !god_u->ext_u ); @@ -892,7 +912,7 @@ _lord_writ_plan(u3_lord* god_u, u3_writ* wit_u) void u3_lord_peek(u3_lord* god_u, u3_pico* pic_u) { - u3_writ* wit_u = _lord_writ_new(god_u); + u3_writ* wit_u = u3_lord_writ_new(god_u); wit_u->typ_e = u3_writ_peek; wit_u->pek_u = c3_calloc(sizeof(*wit_u->pek_u)); wit_u->pek_u->ptr_v = pic_u->ptr_v; @@ -923,7 +943,7 @@ u3_lord_peek(u3_lord* god_u, u3_pico* pic_u) // XX cache check, unless last // - _lord_writ_plan(god_u, wit_u); + u3_lord_writ_plan(god_u, wit_u); } /* u3_lord_play(): recompute batch. @@ -931,7 +951,7 @@ u3_lord_peek(u3_lord* god_u, u3_pico* pic_u) void u3_lord_play(u3_lord* god_u, u3_info fon_u) { - u3_writ* wit_u = _lord_writ_new(god_u); + u3_writ* wit_u = u3_lord_writ_new(god_u); wit_u->typ_e = u3_writ_play; wit_u->fon_u = fon_u; @@ -939,7 +959,7 @@ u3_lord_play(u3_lord* god_u, u3_info fon_u) // // u3_assert( !pay_u.ent_u->nex_u ); - _lord_writ_plan(god_u, wit_u); + u3_lord_writ_plan(god_u, wit_u); } /* u3_lord_work(): attempt work. @@ -947,7 +967,7 @@ u3_lord_play(u3_lord* god_u, u3_info fon_u) void u3_lord_work(u3_lord* god_u, u3_ovum* egg_u, u3_noun job) { - u3_writ* wit_u = _lord_writ_new(god_u); + u3_writ* wit_u = u3_lord_writ_new(god_u); wit_u->typ_e = u3_writ_work; wit_u->wok_u.egg_u = egg_u; wit_u->wok_u.job = job; @@ -961,7 +981,7 @@ u3_lord_work(u3_lord* god_u, u3_ovum* egg_u, u3_noun job) god_u->pin_o = c3y; } - _lord_writ_plan(god_u, wit_u); + u3_lord_writ_plan(god_u, wit_u); } /* u3_lord_save(): save a snapshot. 
@@ -973,9 +993,9 @@ u3_lord_save(u3_lord* god_u) return c3n; } else { - u3_writ* wit_u = _lord_writ_new(god_u); + u3_writ* wit_u = u3_lord_writ_new(god_u); wit_u->typ_e = u3_writ_save; - _lord_writ_plan(god_u, wit_u); + u3_lord_writ_plan(god_u, wit_u); return c3y; } } @@ -989,9 +1009,9 @@ u3_lord_cram(u3_lord* god_u) return c3n; } else { - u3_writ* wit_u = _lord_writ_new(god_u); + u3_writ* wit_u = u3_lord_writ_new(god_u); wit_u->typ_e = u3_writ_cram; - _lord_writ_plan(god_u, wit_u); + u3_lord_writ_plan(god_u, wit_u); return c3y; } } @@ -1001,9 +1021,9 @@ u3_lord_cram(u3_lord* god_u) void u3_lord_meld(u3_lord* god_u) { - u3_writ* wit_u = _lord_writ_new(god_u); + u3_writ* wit_u = u3_lord_writ_new(god_u); wit_u->typ_e = u3_writ_meld; - _lord_writ_plan(god_u, wit_u); + u3_lord_writ_plan(god_u, wit_u); } /* u3_lord_pack(): defragment persistent state. @@ -1011,9 +1031,9 @@ u3_lord_meld(u3_lord* god_u) void u3_lord_pack(u3_lord* god_u) { - u3_writ* wit_u = _lord_writ_new(god_u); + u3_writ* wit_u = u3_lord_writ_new(god_u); wit_u->typ_e = u3_writ_pack; - _lord_writ_plan(god_u, wit_u); + u3_lord_writ_plan(god_u, wit_u); } /* u3_lord_exit(): shutdown gracefully. 
@@ -1021,9 +1041,9 @@ u3_lord_pack(u3_lord* god_u) void u3_lord_exit(u3_lord* god_u) { - u3_writ* wit_u = _lord_writ_new(god_u); + u3_writ* wit_u = u3_lord_writ_new(god_u); wit_u->typ_e = u3_writ_exit; - _lord_writ_plan(god_u, wit_u); + u3_lord_writ_plan(god_u, wit_u); // XX set timer, then halt } diff --git a/pkg/vere/main.c b/pkg/vere/main.c index 8549e014ee..cc6c83e7fd 100644 --- a/pkg/vere/main.c +++ b/pkg/vere/main.c @@ -1781,7 +1781,7 @@ _cw_grab(c3_i argc, c3_c* argv[]) u3m_boot(u3_Host.dir_c, (size_t)1 << u3_Host.ops_u.lom_y); u3C.wag_w |= u3o_hashless; - u3_serf_grab(); + u3z(u3_serf_grab(c3y)); u3m_stop(); } diff --git a/pkg/vere/pier.c b/pkg/vere/pier.c index fe35b50135..9be46a7c91 100644 --- a/pkg/vere/pier.c +++ b/pkg/vere/pier.c @@ -585,11 +585,9 @@ _czar_boot_data(c3_c* czar_c, &czar_lyf, &czar_bon, &czar_ack)) && (c3y == u3r_safe_word(czar_glx, czar_glx_w)) && (c3y == u3r_safe_word(czar_ryf, czar_ryf_w)) && - (c3y == u3r_safe_word(czar_lyf, czar_lyf_w)) && - (c3y == u3du(czar_bon)) && - (c3y == u3r_safe_word(u3t(czar_bon), czar_bon_w)) && - (c3y == u3du(czar_ack)) && - (c3y == u3r_safe_word(u3t(czar_ack), czar_ack_w)) ) { + (c3y == u3r_safe_word(czar_lyf, czar_lyf_w)) ) { + if ( c3y == u3du(czar_bon) ) u3r_safe_word(u3t(czar_bon), czar_bon_w); + if ( c3y == u3du(czar_ack) ) u3r_safe_word(u3t(czar_ack), czar_ack_w); ret_o = c3y; } @@ -631,27 +629,32 @@ _boot_scry_cb(void* vod_p, u3_noun nun) &czar_glx_w, &czar_ryf_w, &czar_lyf_w, &czar_bon_w, &czar_ack_w) ) { - u3l_log("boot: peer-state unvailable on czar, cannot protect from double boot"); + u3l_log("boot: peer-state unvailable on czar, cannot protect from double-boot"); _pier_work(wok_u); } else { if ( czar_ryf_w == ryf_w ) { c3_w ack_w = cur_w - 1; if ( czar_ack_w == 0xFFFFFFFF ) { // This codepath should never be hit - u3l_log("boot: message-sink-state unvailable on czar, cannot protect from double boot"); + u3l_log("boot: message-sink-state unvailable on czar, cannot protect from double-boot"); 
_pier_work(wok_u); } else if ( (czar_ack_w == ack_w) || ((nex_w > cur_w) && (czar_ack_w - 1 == ack_w)) ) { _pier_work(wok_u); } else { - u3l_log("boot: failed: czar last ack: %d, ship last ack: %d", - czar_ack_w, ack_w); + u3l_log("boot: failed: double-boot detected, refusing to boot %s\r\n" + "this pier is an old copy, boot the latest pier or breach\r\n" + "read more: https://docs.urbit.org/glossary/double-boot", + who_c); u3_king_bail(); } } else { // Trying to boot old ship after breach - u3l_log("boot: failed: rift in czar peer-state: %d, current rift: %d", - czar_ryf_w, ryf_w); + u3l_log("boot: failed: double-boot detected, refusing to boot %s\r\n" + "this ship has been breached since its initialization, " + "boot the latest pier or breach again\r\n" + "read more: https://docs.urbit.org/glossary/double-boot", + who_c); u3_king_bail(); } } @@ -676,14 +679,17 @@ _boot_scry_cb(void* vod_p, u3_noun nun) c3_free(czar_c); u3_weak kf_ryf = wok_u->pir_u->ryf; if ( kf_ryf == u3_none ) { - u3l_log("boot: keyfile rift unavailable, cannot protect from double boot"); + u3l_log("boot: keyfile rift unavailable, cannot protect from double-boot"); _pier_work(wok_u); } else if ( kf_ryf > czar_ryf_w ) { // Ship has breached, continue boot _pier_work(wok_u); } else { - u3l_log("boot: failed: rift in czar peer state: %d, keyfile rift: %d", - czar_ryf_w, kf_ryf); + u3l_log("boot: failed: double-boot detected, refusing to boot %s\r\n" + "this ship has already been booted elsewere, " + "boot the existing pier or breach\r\n" + "read more: https://docs.urbit.org/glossary/double-boot", + who_c); u3_king_bail(); } } @@ -692,7 +698,7 @@ _boot_scry_cb(void* vod_p, u3_noun nun) * Boot scry endpoint doesn't exists. Most likely old arvo. * Continue boot and hope for the best. 
*/ - u3l_log("boot: %%boot scry endpoint doesn't exist, cannot protect from double boot"); + u3l_log("boot: %%boot scry endpoint doesn't exist, cannot protect from double-boot"); _pier_work(wok_u); } u3z(nun); u3z(who); diff --git a/pkg/vere/serf.c b/pkg/vere/serf.c index 17450780f1..a3ef017e1e 100644 --- a/pkg/vere/serf.c +++ b/pkg/vere/serf.c @@ -27,12 +27,14 @@ :: next steps: [%peek mil=@ sam=*] :: gang (each path $%([%once @tas @tas path] [beam @tas beam])) [%play eve=@ lit=(list ?((pair @da ovum) *))] [%work mil=@ job=(pair @da ovum)] + [%quiz $%([%quac ~])] == :: +plea: from serf to king :: +$ plea $% [%live ~] [%ripe [pro=%1 hon=@ nok=@] eve=@ mug=@] + [%quiz $%([%quac p=*])] [%slog pri=@ tank] [%flog cord] $: %peek @@ -71,18 +73,70 @@ enum { _serf_fag_vega = 1 << 4 // kernel reset }; +/* _serf_quac: convert a quac to a noun. +*/ +u3_noun +_serf_quac(u3m_quac* mas_u) +{ + u3_noun list = u3_nul; + c3_w i_w = 0; + if ( mas_u->qua_u != NULL ) { + while ( mas_u->qua_u[i_w] != NULL ) { + list = u3nc(_serf_quac(mas_u->qua_u[i_w]), list); + i_w++; + } + } + list = u3kb_flop(list); + + u3_noun mas = u3nt(u3i_string(mas_u->nam_c), u3i_word(mas_u->siz_w), list); + + c3_free(mas_u->nam_c); + c3_free(mas_u->qua_u); + c3_free(mas_u); + + return mas; +} + +/* _serf_quacs: convert an array of quacs to a noun list. +*/ +u3_noun +_serf_quacs(u3m_quac** all_u) +{ + u3_noun list = u3_nul; + c3_w i_w = 0; + while ( all_u[i_w] != NULL ) { + list = u3nc(_serf_quac(all_u[i_w]), list); + i_w++; + } + c3_free(all_u); + return u3kb_flop(list); +} + +/* _serf_print_quacs: print an array of quacs. +*/ +void +_serf_print_quacs(FILE* fil_u, u3m_quac** all_u) +{ + fprintf(fil_u, "\r\n"); + c3_w i_w = 0; + while ( all_u[i_w] != NULL ) { + u3a_print_quac(fil_u, 0, all_u[i_w]); + i_w++; + } +} + /* _serf_grab(): garbage collect, checking for profiling. RETAIN. 
*/ -static void -_serf_grab(u3_noun sac) +static u3_noun +_serf_grab(u3_noun sac, c3_o pri_o) { if ( u3_nul == sac) { if ( u3C.wag_w & (u3o_debug_ram | u3o_check_corrupt) ) { u3m_grab(sac, u3_none); } + return u3_nul; } else { - c3_w tot_w = 0; FILE* fil_u; #ifdef U3_MEMORY_LOG @@ -114,36 +168,77 @@ _serf_grab(u3_noun sac) #endif u3_assert( u3R == &(u3H->rod_u) ); - fprintf(fil_u, "\r\n"); - tot_w += u3a_maid(fil_u, "total userspace", u3a_prof(fil_u, 0, sac)); - tot_w += u3m_mark(fil_u); - tot_w += u3a_maid(fil_u, "space profile", u3a_mark_noun(sac)); + u3m_quac* pro_u = u3a_prof(fil_u, sac); + + if ( NULL == pro_u ) { + fflush(fil_u); + u3z(sac); + return u3_nul; + } else { + u3m_quac** all_u = c3_malloc(sizeof(*all_u) * 11); + all_u[0] = pro_u; + + u3m_quac** var_u = u3m_mark(); + all_u[1] = var_u[0]; + all_u[2] = var_u[1]; + all_u[3] = var_u[2]; + all_u[4] = var_u[3]; + c3_free(var_u); + + c3_w tot_w = all_u[0]->siz_w + all_u[1]->siz_w + all_u[2]->siz_w + + all_u[3]->siz_w + all_u[4]->siz_w; - u3a_print_memory(fil_u, "total marked", tot_w); - u3a_print_memory(fil_u, "free lists", u3a_idle(u3R)); - u3a_print_memory(fil_u, "sweep", u3a_sweep()); + all_u[5] = c3_calloc(sizeof(*all_u[5])); + all_u[5]->nam_c = strdup("space profile"); + all_u[5]->siz_w = u3a_mark_noun(sac) * 4; - fflush(fil_u); + tot_w += all_u[5]->siz_w; + + all_u[6] = c3_calloc(sizeof(*all_u[6])); + all_u[6]->nam_c = strdup("total marked"); + all_u[6]->siz_w = tot_w; + + all_u[7] = c3_calloc(sizeof(*all_u[7])); + all_u[7]->nam_c = strdup("free lists"); + all_u[7]->siz_w = u3a_idle(u3R) * 4; + + all_u[8] = c3_calloc(sizeof(*all_u[8])); + all_u[8]->nam_c = strdup("sweep"); + all_u[8]->siz_w = u3a_sweep() * 4; + + all_u[9] = c3_calloc(sizeof(*all_u[9])); + all_u[9]->nam_c = strdup("loom"); + all_u[9]->siz_w = u3C.wor_i * 4; + + all_u[10] = NULL; + + if ( c3y == pri_o ) { + _serf_print_quacs(fil_u, all_u); + } + fflush(fil_u); #ifdef U3_MEMORY_LOG - { - fclose(fil_u); - } + { + fclose(fil_u); + } 
#endif - u3z(sac); + u3_noun mas = _serf_quacs( all_u); + u3z(sac); - u3l_log(""); + return mas; + } } } /* u3_serf_grab(): garbage collect. */ -void -u3_serf_grab(void) +u3_noun +u3_serf_grab(c3_o pri_o) { u3_noun sac = u3_nul; + u3_noun res = u3_nul; u3_assert( u3R == &(u3H->rod_u) ); @@ -174,19 +269,31 @@ u3_serf_grab(void) u3z(gon); } - fprintf(stderr, "serf: measuring memory:\r\n"); - if ( u3_nul != sac ) { - _serf_grab(sac); + res = _serf_grab(sac, pri_o); } else { - u3a_print_memory(stderr, "total marked", u3m_mark(stderr)); + fprintf(stderr, "sac is empty\r\n"); + u3m_quac** var_u = u3m_mark(); + + c3_w tot_w = 0; + c3_w i_w = 0; + while ( var_u[i_w] != NULL ) { + tot_w += var_u[i_w]->siz_w; + u3a_quac_free(var_u[i_w]); + i_w++; + } + c3_free(var_u); + + u3a_print_memory(stderr, "total marked", tot_w / 4); u3a_print_memory(stderr, "free lists", u3a_idle(u3R)); u3a_print_memory(stderr, "sweep", u3a_sweep()); fprintf(stderr, "\r\n"); } fflush(stderr); + + return res; } /* u3_serf_post(): update serf state post-writ. @@ -214,7 +321,7 @@ u3_serf_post(u3_serf* sef_u) // XX this runs on replay too, |mass s/b elsewhere // if ( sef_u->fag_w & _serf_fag_mute ) { - _serf_grab(sef_u->sac); + u3z(_serf_grab(sef_u->sac, c3y)); sef_u->sac = u3_nul; } @@ -930,7 +1037,7 @@ u3_serf_live(u3_serf* sef_u, u3_noun com, u3_noun* ret) } u3m_save(); - u3_serf_grab(); + u3_serf_grab(c3y); *ret = u3nc(c3__live, u3_nul); return c3y; @@ -1048,10 +1155,22 @@ u3_serf_writ(u3_serf* sef_u, u3_noun wit, u3_noun* pel) ret_o = c3y; } } break; + case c3__quiz: { + u3z(wit); + u3_noun res = u3_serf_grab(c3n); + if ( u3_none == res ) { + ret_o = c3n; + } else { + *pel = u3nt(c3__quiz, c3__quac, res); + ret_o = c3y; + } + } break; } } - u3z(wit); + if ( tag != c3__quiz ) { + u3z(wit); + } return ret_o; } diff --git a/pkg/vere/serf.h b/pkg/vere/serf.h index 7cd2ca47d0..0645434015 100644 --- a/pkg/vere/serf.h +++ b/pkg/vere/serf.h @@ -56,7 +56,8 @@ /* u3_serf_grab(): garbage collect. 
*/ - void - u3_serf_grab(void); + u3_noun + u3_serf_grab(c3_o pri_o); + #endif /* ifndef U3_VERE_SERF_H */ diff --git a/pkg/vere/vere.h b/pkg/vere/vere.h index 1708f58347..2363cc02b4 100644 --- a/pkg/vere/vere.h +++ b/pkg/vere/vere.h @@ -460,7 +460,8 @@ u3_writ_cram = 4, u3_writ_meld = 5, u3_writ_pack = 6, - u3_writ_exit = 7 + u3_writ_exit = 7, + u3_writ_quiz = 8 } u3_writ_type; /* u3_writ: ipc message from king to serf @@ -476,6 +477,10 @@ u3_peek* pek_u; // peek u3_info fon_u; // recompute c3_d eve_d; // save/pack at + struct { // serf query: + void* ptr_v; // driver + void (*quiz_f)(void*, u3_noun); // callback + } qui_u; // }; } u3_writ; @@ -771,7 +776,16 @@ u3_atom u3_time_t_in_ts(time_t tim); #endif + /* u3_lord_writ_new(): allocate a new writ. + */ + u3_writ* + u3_lord_writ_new(u3_lord* god_u); + /* u3_lord_writ_plan(): enqueue a writ and send. + */ + void + u3_lord_writ_plan(u3_lord* god_u, u3_writ* wit_u); + /* u3_time_out_ts(): struct timespec from urbit time. */ void