From e5690487f2ae0f02648083acb14ff4e98cc0fd87 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Wed, 28 Feb 2024 13:25:07 -0600 Subject: [PATCH 01/97] WIP bazel --- WORKSPACE.bazel | 12 +++++- bazel/third_party/softblas/BUILD.bazel | 0 bazel/third_party/softblas/softblas.BUILD | 52 +++++++++++++++++++++++ pkg/noun/BUILD.bazel | 1 + pkg/noun/jets/q.h | 2 + pkg/noun/jets/tree.c | 7 +++ pkg/noun/jets/w.h | 2 + 7 files changed, 74 insertions(+), 2 deletions(-) create mode 100644 bazel/third_party/softblas/BUILD.bazel create mode 100644 bazel/third_party/softblas/softblas.BUILD diff --git a/WORKSPACE.bazel b/WORKSPACE.bazel index d3c4848fd1..5553f2357c 100644 --- a/WORKSPACE.bazel +++ b/WORKSPACE.bazel @@ -291,6 +291,14 @@ versioned_http_archive( version = "2.14", ) +versioned_http_archive( + name = "softblas", + build_file = "//bazel/third_party/softblas:softblas.BUILD", + # sha256 = "", + url = "https://github.com/urbit/SoftBLAS/archive/{version}.tar.gz", + version = "bd637fdb23ac4ebd4048eb546633262ddf647b18", +) + versioned_http_archive( name = "softfloat", build_file = "//bazel/third_party/softfloat:softfloat.BUILD", @@ -354,10 +362,10 @@ versioned_http_archive( versioned_http_archive( name = "zlib", build_file = "//bazel/third_party/zlib:zlib.BUILD", - sha256 = "ff0ba4c292013dbc27530b3a81e1f9a813cd39de01ca5e0f8bf355702efa593e", + sha256 = "9a93b2b7dfdac77ceba5a558a580e74667dd6fede4585b91eefb60f03b72df23", strip_prefix = "zlib-{version}", url = "https://www.zlib.net/zlib-{version}.tar.gz", - version = "1.3", + version = "1.3.1", ) # diff --git a/bazel/third_party/softblas/BUILD.bazel b/bazel/third_party/softblas/BUILD.bazel new file mode 100644 index 0000000000..e69de29bb2 diff --git a/bazel/third_party/softblas/softblas.BUILD b/bazel/third_party/softblas/softblas.BUILD new file mode 100644 index 0000000000..8d669a7f77 --- /dev/null +++ b/bazel/third_party/softblas/softblas.BUILD @@ -0,0 +1,52 @@ +# FILEPATH: 
/home/neal/lagoon/vere/bazel/third_party/softblas/softblas.BUILD + +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") + +cc_library( + name = "softblas", + visibility = ["//visibility:public"], + hdrs = ["include/softblas.h"], + # includes = ["include"], + srcs = ["src/blas/level1/sasum.c", + "src/blas/level1/dasum.c", + "src/blas/level1/hasum.c", + "src/blas/level1/qasum.c", + "src/blas/level1/saxpy.c", + "src/blas/level1/daxpy.c", + "src/blas/level1/haxpy.c", + "src/blas/level1/qaxpy.c", + "src/blas/level1/scopy.c", + "src/blas/level1/dcopy.c", + "src/blas/level1/hcopy.c", + "src/blas/level1/qcopy.c", + "src/blas/level1/sdot.c", + "src/blas/level1/ddot.c", + "src/blas/level1/hdot.c", + "src/blas/level1/qdot.c", + "src/blas/level1/snrm2.c", + "src/blas/level1/dnrm2.c", + "src/blas/level1/hnrm2.c", + "src/blas/level1/qnrm2.c", + "src/blas/level1/sscal.c", + "src/blas/level1/dscal.c", + "src/blas/level1/hscal.c", + "src/blas/level1/qscal.c", + "src/blas/level1/sswap.c", + "src/blas/level1/dswap.c", + "src/blas/level1/hswap.c", + "src/blas/level1/qswap.c", + "src/blas/level1/isamax.c", + "src/blas/level1/idamax.c", + "src/blas/level1/ihamax.c", + "src/blas/level1/iqamax.c", + "src/blas/level2/sgemv.c", + "src/blas/level2/dgemv.c", + "src/blas/level2/hgemv.c", + "src/blas/level2/qgemv.c", + "src/blas/level3/sgemm.c", + "src/blas/level3/dgemm.c", + "src/blas/level3/hgemm.c", + "src/blas/level3/qgemm.c" + ], + deps = ["@softfloat"], +) diff --git a/pkg/noun/BUILD.bazel b/pkg/noun/BUILD.bazel index a6b8de6d7a..1e16d21573 100644 --- a/pkg/noun/BUILD.bazel +++ b/pkg/noun/BUILD.bazel @@ -38,6 +38,7 @@ vere_library( "@openssl", "@pdjson", "@sigsegv", + "@softblas", "@softfloat", "@urcrypt", ] + select({ diff --git a/pkg/noun/jets/q.h b/pkg/noun/jets/q.h index 33c63ef42b..8b99103e56 100644 --- a/pkg/noun/jets/q.h +++ b/pkg/noun/jets/q.h @@ -247,6 +247,8 @@ u3_noun u3qfp_nepo(u3_noun, u3_noun); u3_noun u3qfp_rake(u3_noun); + u3_noun u3qf_la_add_real(u3_noun, 
u3_noun, u3_noun); + # define u3qfu_van_fan 28 # define u3qfu_van_rib 58 # define u3qfu_van_vet 59 diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index b0264327d1..0499a5202d 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2147,6 +2147,11 @@ static u3j_core _139_hex_json_d[] = {} }; +/* linear algebra jets +*/ + +static u3j_harm _139_hex_lagoon_add_a[] = {{".2", u3wf_la_add}, {}}; + static u3j_core _139_hex_d[] = { { "lore", 63, _140_hex_lore_a, 0, no_hashes }, { "leer", 63, _140_hex_leer_a, 0, no_hashes }, @@ -2165,6 +2170,8 @@ static u3j_core _139_hex_d[] = { "secp", 6, 0, _140_hex_secp_d, no_hashes }, { "mimes", 31, 0, _140_hex_mimes_d, no_hashes }, { "json", 31, 0, _139_hex_json_d, no_hashes }, + + { "add", 7, _139_hex_lagoon_add_a, 0, no_hashes }, {} }; diff --git a/pkg/noun/jets/w.h b/pkg/noun/jets/w.h index d838416c03..5cc40f1975 100644 --- a/pkg/noun/jets/w.h +++ b/pkg/noun/jets/w.h @@ -331,4 +331,6 @@ u3_noun u3wfu_repo(u3_noun); u3_noun u3wfu_rest(u3_noun); + u3_noun u3wf_la_add(u3_noun); + #endif /* ifndef U3_JETS_W_H */ From 3cf0ad60ffb2a15f965e7819deaacb580447076b Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Wed, 28 Feb 2024 14:37:22 -0500 Subject: [PATCH 02/97] WIP bazel builds but `qgemm.c` needs `#include ` --- WORKSPACE.bazel | 1 + bazel/third_party/softblas/softblas.BUILD | 84 ++++++++++++----------- 2 files changed, 44 insertions(+), 41 deletions(-) diff --git a/WORKSPACE.bazel b/WORKSPACE.bazel index 5553f2357c..ddb83b8361 100644 --- a/WORKSPACE.bazel +++ b/WORKSPACE.bazel @@ -294,6 +294,7 @@ versioned_http_archive( versioned_http_archive( name = "softblas", build_file = "//bazel/third_party/softblas:softblas.BUILD", + strip_prefix = "SoftBLAS-{version}", # sha256 = "", url = "https://github.com/urbit/SoftBLAS/archive/{version}.tar.gz", version = "bd637fdb23ac4ebd4048eb546633262ddf647b18", diff --git a/bazel/third_party/softblas/softblas.BUILD b/bazel/third_party/softblas/softblas.BUILD index 
8d669a7f77..2ca6b46b1a 100644 --- a/bazel/third_party/softblas/softblas.BUILD +++ b/bazel/third_party/softblas/softblas.BUILD @@ -6,47 +6,49 @@ cc_library( name = "softblas", visibility = ["//visibility:public"], hdrs = ["include/softblas.h"], - # includes = ["include"], - srcs = ["src/blas/level1/sasum.c", - "src/blas/level1/dasum.c", - "src/blas/level1/hasum.c", - "src/blas/level1/qasum.c", - "src/blas/level1/saxpy.c", - "src/blas/level1/daxpy.c", - "src/blas/level1/haxpy.c", - "src/blas/level1/qaxpy.c", - "src/blas/level1/scopy.c", - "src/blas/level1/dcopy.c", - "src/blas/level1/hcopy.c", - "src/blas/level1/qcopy.c", - "src/blas/level1/sdot.c", - "src/blas/level1/ddot.c", - "src/blas/level1/hdot.c", - "src/blas/level1/qdot.c", - "src/blas/level1/snrm2.c", - "src/blas/level1/dnrm2.c", - "src/blas/level1/hnrm2.c", - "src/blas/level1/qnrm2.c", - "src/blas/level1/sscal.c", - "src/blas/level1/dscal.c", - "src/blas/level1/hscal.c", - "src/blas/level1/qscal.c", - "src/blas/level1/sswap.c", - "src/blas/level1/dswap.c", - "src/blas/level1/hswap.c", - "src/blas/level1/qswap.c", - "src/blas/level1/isamax.c", - "src/blas/level1/idamax.c", - "src/blas/level1/ihamax.c", - "src/blas/level1/iqamax.c", - "src/blas/level2/sgemv.c", - "src/blas/level2/dgemv.c", - "src/blas/level2/hgemv.c", - "src/blas/level2/qgemv.c", - "src/blas/level3/sgemm.c", - "src/blas/level3/dgemm.c", - "src/blas/level3/hgemm.c", - "src/blas/level3/qgemm.c" + includes = ["include"], + srcs = [ + "include/softblas.h", + "src/blas/level1/sasum.c", + "src/blas/level1/dasum.c", + "src/blas/level1/hasum.c", + "src/blas/level1/qasum.c", + "src/blas/level1/saxpy.c", + "src/blas/level1/daxpy.c", + "src/blas/level1/haxpy.c", + "src/blas/level1/qaxpy.c", + "src/blas/level1/scopy.c", + "src/blas/level1/dcopy.c", + "src/blas/level1/hcopy.c", + "src/blas/level1/qcopy.c", + "src/blas/level1/sdot.c", + "src/blas/level1/ddot.c", + "src/blas/level1/hdot.c", + "src/blas/level1/qdot.c", + "src/blas/level1/snrm2.c", + 
"src/blas/level1/dnrm2.c", + "src/blas/level1/hnrm2.c", + "src/blas/level1/qnrm2.c", + "src/blas/level1/sscal.c", + "src/blas/level1/dscal.c", + "src/blas/level1/hscal.c", + "src/blas/level1/qscal.c", + "src/blas/level1/sswap.c", + "src/blas/level1/dswap.c", + "src/blas/level1/hswap.c", + "src/blas/level1/qswap.c", + "src/blas/level1/isamax.c", + "src/blas/level1/idamax.c", + "src/blas/level1/ihamax.c", + "src/blas/level1/iqamax.c", + "src/blas/level2/sgemv.c", + "src/blas/level2/dgemv.c", + "src/blas/level2/hgemv.c", + "src/blas/level2/qgemv.c", + "src/blas/level3/sgemm.c", + "src/blas/level3/dgemm.c", + "src/blas/level3/hgemm.c", + "src/blas/level3/qgemm.c" ], deps = ["@softfloat"], ) From 1beb9c35dd9a0d77daac95a5e1ad2b5c02ecb997 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Wed, 28 Feb 2024 13:49:18 -0600 Subject: [PATCH 03/97] Include new commit hash. --- WORKSPACE.bazel | 2 +- bazel/third_party/softblas/softblas.BUILD | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WORKSPACE.bazel b/WORKSPACE.bazel index 5553f2357c..3938734312 100644 --- a/WORKSPACE.bazel +++ b/WORKSPACE.bazel @@ -296,7 +296,7 @@ versioned_http_archive( build_file = "//bazel/third_party/softblas:softblas.BUILD", # sha256 = "", url = "https://github.com/urbit/SoftBLAS/archive/{version}.tar.gz", - version = "bd637fdb23ac4ebd4048eb546633262ddf647b18", + version = "cbf3dfff5882fd03f28a74c7c0c6ef4c27ec176d", ) versioned_http_archive( diff --git a/bazel/third_party/softblas/softblas.BUILD b/bazel/third_party/softblas/softblas.BUILD index 8d669a7f77..101487d86a 100644 --- a/bazel/third_party/softblas/softblas.BUILD +++ b/bazel/third_party/softblas/softblas.BUILD @@ -6,7 +6,7 @@ cc_library( name = "softblas", visibility = ["//visibility:public"], hdrs = ["include/softblas.h"], - # includes = ["include"], + includes = ["include"], srcs = ["src/blas/level1/sasum.c", "src/blas/level1/dasum.c", "src/blas/level1/hasum.c", From e91829d00b15de10c582696548bd2dc5968b24b6 Mon Sep 17 
00:00:00 2001 From: Sigilante Date: Thu, 29 Feb 2024 10:30:41 -0600 Subject: [PATCH 04/97] Post jet fundamentals. --- pkg/c3/motes.h | 6 ++ pkg/noun/jets/f/lagoon.c | 157 +++++++++++++++++++++++++++++++++++++++ pkg/noun/jets/q.h | 2 +- pkg/noun/jets/tree.c | 10 ++- 4 files changed, 171 insertions(+), 4 deletions(-) create mode 100644 pkg/noun/jets/f/lagoon.c diff --git a/pkg/c3/motes.h b/pkg/c3/motes.h index db17834057..3cbce8c48c 100644 --- a/pkg/c3/motes.h +++ b/pkg/c3/motes.h @@ -258,6 +258,7 @@ # define c3__corp c3_s4('c','o','r','p') # define c3__corp c3_s4('c','o','r','p') # define c3__cow c3_s3('c','o','w') +# define c3__cplx c3_s4('c','p','l','x') # define c3__cpu c3_s3('c','p','u') # define c3__crad c3_s4('c','r','a','d') # define c3__cram c3_s4('c','r','a','m') @@ -430,6 +431,7 @@ # define c3__fit c3_s3('f','i','t') # define c3__fits c3_s4('f','i','t','s') # define c3__fix c3_s3('f','i','x') +# define c3__fixp c3_s4('f','i','x','p') # define c3__fl c3_s2('f','l') # define c3__flac c3_s4('f','l','a','c') # define c3__flag c3_s4('f','l','a','g') @@ -602,6 +604,7 @@ # define c3__info c3_s4('i','n','f','o') # define c3__init c3_s4('i','n','i','t') # define c3__ins c3_s3('i','n','s') +# define c3__int2 c3_s4('i','n','t','2') # define c3__into c3_s4('i','n','t','o') # define c3__intr c3_s4('i','n','t','r') # define c3__inuk c3_s4('i','n','u','k') @@ -970,6 +973,7 @@ # define c3__rasp c3_s4('r','a','s','p') # define c3__raw c3_s3('r','a','w') # define c3__read c3_s4('r','e','a','d') +# define c3__real c3_s4('r','e','a','l') # define c3__reck c3_s4('r','e','c','k') # define c3__reef c3_s4('r','e','e','f') # define c3__resd c3_s4('r','e','s','d') @@ -1229,11 +1233,13 @@ # define c3__ubin c3_s4('u','b','i','n') # define c3__ubit c3_s4('u','b','i','t') # define c3__ud c3_s2('u','d') +# define c3__uint c3_s4('u','i','n','t') # define c3__ulib c3_s4('u','l','i','b') # define c3__un c3_s2('u','n') # define c3__uniq c3_s4('u','n','i','q') # define c3__unix
c3_s4('u','n','i','x') # define c3__unt c3_s3('u','n','t') +# define c3__unum c3_s4('u','n','u','m') # define c3__up c3_s2('u','p') # define c3__url c3_s3('u','r','l') # define c3__urth c3_s4('u','r','t','h') diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c new file mode 100644 index 0000000000..e45b1ce124 --- /dev/null +++ b/pkg/noun/jets/f/lagoon.c @@ -0,0 +1,157 @@ +/// @file + +#include "jets/q.h" +#include "jets/w.h" + +#include "noun.h" +#include "softfloat.h" +#include "softblas.h" + +#include <stdio.h> + + union half { + float16_t h; + c3_w c; + }; + + union sing { + float32_t s; + c3_w c; + }; + + union doub { + float64_t d; + c3_d c; + }; + + union quad { + float128_t q; + c3_d c[2]; + }; + + static inline void + _set_rounding(c3_w a) + { + switch ( a ) + { + default: + u3m_bail(c3__fail); + break; + case c3__n: + softfloat_roundingMode = softfloat_round_near_even; + break; + case c3__z: + softfloat_roundingMode = softfloat_round_minMag; + break; + case c3__u: + softfloat_roundingMode = softfloat_round_max; + break; + case c3__d: + softfloat_roundingMode = softfloat_round_min; + break; + } + } + +/* add +*/ + u3_noun + u3qf_la_add_real(u3_noun a_data, + u3_noun b_data, + u3_noun shape, + u3_noun bloq, + u3_noun rnd) + { + + fprintf(stderr, ">> u3qf_la_add_real\n"); + + // SoftBLAS needs to be used here. + return u3_none; + + // // Split a into component atoms.
+ // // (roll shape mul) => 2 x 3 = 6 + // c3_w size = 1; + // u3_atom shp = shape; + // while (u3_nul != shp) { + // shp = u3t(shp); + // size *= shp; + // } + + + + + // return u3i_word(len_w); + + + // union sing c, d, e; + // _set_rounding(r); + // c.c = u3r_word(0, a); + // d.c = u3r_word(0, b); + // e.s = _nan_unify_s(f32_add(c.s, d.s)); + + // return u3i_words(1, &e.c); + } + + u3_noun + u3wf_la_add(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun a_meta, a_data, + b_meta, b_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &a_meta, + u3x_sam_5, &a_data, + u3x_sam_6, &b_meta, + u3x_sam_7, &b_data, + 0) || + c3n == u3ud(a_data) || + c3n == u3ud(b_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun a_shape, a_bloq, a_kind, + b_shape, b_bloq, b_kind, + rnd, fxp; + if ( c3n == u3r_mean(a_meta, + 2, &a_shape, + 6, &a_bloq, + 7, &a_kind, + 0) || + c3n == u3r_mean(b_meta, + 2, &b_shape, + 6, &b_bloq, + 7, &b_kind, + 0) || + c3n == u3r_sing(a_shape, b_shape) || + c3n == u3r_sing(a_bloq, b_bloq) || + c3n == u3r_sing(a_kind, b_kind) || + c3n == u3r_mean(cor, 60, &rnd, 61, &fxp, 0) + ) + { + return u3m_bail(c3__exit); + } else { + switch (a_kind) { + case c3__real: + return u3qf_la_add_real(a_data, b_data, a_shape, a_bloq, rnd); + + // case c3__int2: + // return u3qf_la_add_int2(a_data, b_data, a_shape, a_bloq); + + // case c3__uint: + // return u3qf_la_add_uint(a_data, b_data, a_shape, a_bloq); + + // case c3__cplx: + // return u3qf_la_add_cplx(a_data, b_data, a_shape, a_bloq, rnd); + + // case c3__unum: + // return u3qf_la_add_unum(a_data, b_data, a_shape, a_bloq); + + // case c3__fixp: + // return u3qf_la_add_fixp(a_data, b_data, a_shape, a_bloq); + + default: + return u3_none; + } + } + } + } diff --git a/pkg/noun/jets/q.h b/pkg/noun/jets/q.h index 8b99103e56..c7cb5ccfd0 100644 --- a/pkg/noun/jets/q.h +++ b/pkg/noun/jets/q.h @@ -247,7 +247,7 @@ u3_noun u3qfp_nepo(u3_noun, u3_noun); u3_noun u3qfp_rake(u3_noun); - u3_noun 
u3qf_la_add_real(u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_add_real(u3_noun, u3_noun, u3_noun, u3_noun, u3_noun); # define u3qfu_van_fan 28 # define u3qfu_van_rib 58 diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index 0499a5202d..375f0e1610 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2148,9 +2148,13 @@ static u3j_core _139_hex_json_d[] = }; /* linear algebra jets + XX move to outer _hep_ core for /lib? */ - -static u3j_harm _139_hex_lagoon_add_a[] = {{".2", u3wf_la_add}, {}}; +static u3j_harm _139_hex__lagoon_add_a[] = {{".2", u3wf_la_add}, {}}; +static u3j_core _139_hex__lagoon_d[] = + { { "add-rays", 7, _139_hex__lagoon_add_a, 0, no_hashes }, + {} + }; static u3j_core _139_hex_d[] = { { "lore", 63, _140_hex_lore_a, 0, no_hashes }, @@ -2171,7 +2175,7 @@ static u3j_core _139_hex_d[] = { "mimes", 31, 0, _140_hex_mimes_d, no_hashes }, { "json", 31, 0, _139_hex_json_d, no_hashes }, - { "add", 7, _139_hex_lagoon_add_a, 0, no_hashes }, + { "lagoon", 31, 0, _139_hex__lagoon_d, no_hashes }, {} }; From 4be17fc8b90eb7c6b68f4510b3541f50f2806647 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Thu, 29 Feb 2024 13:11:05 -0600 Subject: [PATCH 05/97] Hints for Lagoon work. 
--- pkg/noun/jets/f/lagoon.c | 29 +++++++++++++++++------------ pkg/noun/jets/tree.c | 10 ++++++++-- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c index e45b1ce124..5b27ccb31e 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/f/lagoon.c @@ -97,6 +97,7 @@ // Each argument is a ray, [=meta data=@ux] u3_noun a_meta, a_data, b_meta, b_data; + fprintf(stderr, "\n>> u3wf_la_add\n"); if ( c3n == u3r_mean(cor, u3x_sam_4, &a_meta, @@ -109,27 +110,31 @@ { return u3m_bail(c3__exit); } else { - u3_noun a_shape, a_bloq, a_kind, - b_shape, b_bloq, b_kind, - rnd, fxp; + u3_noun a_shape, a_bloq, a_kind, a_fxp, + b_shape, b_bloq, b_kind, b_fxp, + rnd; if ( c3n == u3r_mean(a_meta, - 2, &a_shape, - 6, &a_bloq, - 7, &a_kind, - 0) || + 2, &a_shape, + 6, &a_bloq, + 14, &a_kind, + 15, &a_fxp, + 0) || c3n == u3r_mean(b_meta, - 2, &b_shape, - 6, &b_bloq, - 7, &b_kind, - 0) || + 2, &b_shape, + 6, &b_bloq, + 14, &b_kind, + 15, &b_fxp, + 0) || c3n == u3r_sing(a_shape, b_shape) || c3n == u3r_sing(a_bloq, b_bloq) || c3n == u3r_sing(a_kind, b_kind) || - c3n == u3r_mean(cor, 60, &rnd, 61, &fxp, 0) + // fxp does not need to match so no check + c3n == u3r_mean(cor, 31, &rnd, 0) ) { return u3m_bail(c3__exit); } else { + fprintf(stderr, ">> u3wf_la_add: a_kind: %x\n", a_kind); switch (a_kind) { case c3__real: return u3qf_la_add_real(a_data, b_data, a_shape, a_bloq, rnd); diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index 375f0e1610..b6597ffa7f 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2148,14 +2148,20 @@ static u3j_core _139_hex_json_d[] = }; /* linear algebra jets - XX move to outer _hep_ core for /lib? + XX move to outer _sep_ core for /lib? 
eventually +static u3j_core _139_sep_d[] = */ static u3j_harm _139_hex__lagoon_add_a[] = {{".2", u3wf_la_add}, {}}; -static u3j_core _139_hex__lagoon_d[] = +static u3j_core _139_hex__la_core_d[] = { { "add-rays", 7, _139_hex__lagoon_add_a, 0, no_hashes }, {} }; +static u3j_core _139_hex__lagoon_d[] = + { { "la-core", 7, 0, _139_hex__la_core_d, no_hashes }, + {} + }; + static u3j_core _139_hex_d[] = { { "lore", 63, _140_hex_lore_a, 0, no_hashes }, { "leer", 63, _140_hex_leer_a, 0, no_hashes }, From 81718e2d75e4e49471be12ae697ca10fe574878e Mon Sep 17 00:00:00 2001 From: Sigilante Date: Thu, 29 Feb 2024 14:04:23 -0600 Subject: [PATCH 06/97] ++add for Lagoon working. --- pkg/noun/jets/f/lagoon.c | 89 +++++++++++++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 19 deletions(-) diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c index 5b27ccb31e..3aa27e63d3 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/f/lagoon.c @@ -52,6 +52,18 @@ } } +/* shape +*/ + static inline uint64_t _get_shape(u3_noun shape) + { + uint64_t res = 1; + while (u3_nul != shape) { + res = res * u3h(shape); + shape = u3t(shape); + } + return res; + } + /* add */ u3_noun @@ -64,31 +76,72 @@ fprintf(stderr, ">> u3qf_la_add_real\n"); - // SoftBLAS needs to be used here. - return u3_none; + // Unpack the data as a byte array for SoftBLAS. + uint64_t len_a = _get_shape(shape); + uint8_t* a_bytes = (uint8_t*)malloc(len_a*sizeof(uint8_t)); + u3r_bytes(0, len_a, a_bytes, a_data); + uint8_t* b_bytes = (uint8_t*)malloc(len_a*sizeof(uint8_t)); + u3r_bytes(0, len_a, b_bytes, b_data); + + u3_noun r_data; + + // Switch on the block size. + switch (bloq) { + case 4: + haxpy(len_a, (float16_t){SB_REAL16_ONE}, (float16_t*)a_bytes, 1, (float16_t*)b_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes(len_a, b_bytes); - // // Split a into component atoms. 
- // // (roll shape mul) => 2 x 3 = 6 - // c3_w size = 1; - // u3_atom shp = shape; - // while (u3_nul != shp) { - // shp = u3t(shp); - // size *= shp; - // } + // Clean up. + free(a_bytes); + free(b_bytes); + return u3nc(a_data, r_data); + break; + case 5: + saxpy(len_a, (float32_t){SB_REAL32_ONE}, (float32_t*)a_bytes, 1, (float32_t*)b_bytes, 1); + // Unpack the result back into a noun. + r_data = u3i_bytes(len_a, b_bytes); - // return u3i_word(len_w); + // Clean up. + free(a_bytes); + free(b_bytes); + return u3nc(a_data, r_data); + break; - // union sing c, d, e; - // _set_rounding(r); - // c.c = u3r_word(0, a); - // d.c = u3r_word(0, b); - // e.s = _nan_unify_s(f32_add(c.s, d.s)); + case 6: + daxpy(len_a, (float64_t){SB_REAL64_ONE}, (float64_t*)a_bytes, 1, (float64_t*)b_bytes, 1); - // return u3i_words(1, &e.c); + // Unpack the result back into a noun. + r_data = u3i_bytes(len_a, b_bytes); + + // Clean up. + free(a_bytes); + free(b_bytes); + + return u3nc(a_data, r_data); + break; + + case 7: + qaxpy(len_a, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)a_bytes, 1, (float128_t*)b_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes(len_a, b_bytes); + + // Clean up. + free(a_bytes); + free(b_bytes); + + return u3nc(a_data, r_data); + break; + + default: + return u3_none; + } } u3_noun @@ -97,7 +150,6 @@ // Each argument is a ray, [=meta data=@ux] u3_noun a_meta, a_data, b_meta, b_data; - fprintf(stderr, "\n>> u3wf_la_add\n"); if ( c3n == u3r_mean(cor, u3x_sam_4, &a_meta, @@ -134,7 +186,6 @@ { return u3m_bail(c3__exit); } else { - fprintf(stderr, ">> u3wf_la_add: a_kind: %x\n", a_kind); switch (a_kind) { case c3__real: return u3qf_la_add_real(a_data, b_data, a_shape, a_bloq, rnd); From 1268c90a89c9eb64cb68c68b565e563bb4bda370 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Tue, 5 Mar 2024 15:52:40 -0600 Subject: [PATCH 07/97] Works with SoftBLAS. 
--- MODULE.bazel | 6 + WORKSPACE.bazel | 2 +- bazel/third_party/softblas/softblas.BUILD | 1 + pkg/noun/jets/f/lagoon.c | 256 ++++++++++++++++++---- pkg/noun/jets/q.h | 3 +- pkg/noun/jets/tree.c | 4 +- pkg/noun/jets/w.h | 1 + 7 files changed, 232 insertions(+), 41 deletions(-) create mode 100644 MODULE.bazel diff --git a/MODULE.bazel b/MODULE.bazel new file mode 100644 index 0000000000..00bb18361f --- /dev/null +++ b/MODULE.bazel @@ -0,0 +1,6 @@ +############################################################################### +# Bazel now uses Bzlmod by default to manage external dependencies. +# Please consider migrating your external dependencies from WORKSPACE to MODULE.bazel. +# +# For more details, please check https://github.com/bazelbuild/bazel/issues/18958 +############################################################################### diff --git a/WORKSPACE.bazel b/WORKSPACE.bazel index 85ba2a48b0..3b0296fd0f 100644 --- a/WORKSPACE.bazel +++ b/WORKSPACE.bazel @@ -297,7 +297,7 @@ versioned_http_archive( strip_prefix = "SoftBLAS-{version}", # sha256 = "", url = "https://github.com/urbit/SoftBLAS/archive/{version}.tar.gz", - version = "cbf3dfff5882fd03f28a74c7c0c6ef4c27ec176d", + version = "bace30db3944c0f2bb2b6cac0db9965675ad842e", ) versioned_http_archive( diff --git a/bazel/third_party/softblas/softblas.BUILD b/bazel/third_party/softblas/softblas.BUILD index 2ca6b46b1a..3442c5da45 100644 --- a/bazel/third_party/softblas/softblas.BUILD +++ b/bazel/third_party/softblas/softblas.BUILD @@ -9,6 +9,7 @@ cc_library( includes = ["include"], srcs = [ "include/softblas.h", + "src/softblas_state.c", "src/blas/level1/sasum.c", "src/blas/level1/dasum.c", "src/blas/level1/hasum.c", diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c index 3aa27e63d3..3622165141 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/f/lagoon.c @@ -7,6 +7,7 @@ #include "softfloat.h" #include "softblas.h" +#include #include union half { @@ -29,59 +30,75 @@ c3_d c[2]; }; + // 
$?(%n %u %d %z %a) static inline void _set_rounding(c3_w a) { + // We could use SoftBLAS set_rounding() to set the SoftFloat + // mode as well, but it's more explicit to do it here since + // we may use SoftFloat in any given Lagoon jet and we want + // you, dear developer, to see this set here. + fprintf(stderr, "%x %c\n", a, a); switch ( a ) { default: u3m_bail(c3__fail); break; + // %n - near case c3__n: softfloat_roundingMode = softfloat_round_near_even; + softblas_roundingMode = 'n'; break; + // %z - zero case c3__z: softfloat_roundingMode = softfloat_round_minMag; + softblas_roundingMode = 'z'; break; + // %u - up case c3__u: softfloat_roundingMode = softfloat_round_max; + softblas_roundingMode = 'u'; break; + // %d - down case c3__d: softfloat_roundingMode = softfloat_round_min; + softblas_roundingMode = 'd'; + break; + // %a - away + case c3__a: + softfloat_roundingMode = softfloat_round_near_maxMag; + softblas_roundingMode = 'a'; break; } } /* shape */ - static inline uint64_t _get_shape(u3_noun shape) + static inline uint64_t _get_length(u3_noun shape) { - uint64_t res = 1; + uint64_t len = 1; while (u3_nul != shape) { - res = res * u3h(shape); + len = len * u3h(shape); shape = u3t(shape); } - return res; + return len; } -/* add +/* add - axpy = 1*x+y */ u3_noun u3qf_la_add_real(u3_noun a_data, u3_noun b_data, u3_noun shape, - u3_noun bloq, - u3_noun rnd) + u3_noun bloq) { - - fprintf(stderr, ">> u3qf_la_add_real\n"); - - // Unpack the data as a byte array for SoftBLAS. - uint64_t len_a = _get_shape(shape); - uint8_t* a_bytes = (uint8_t*)malloc(len_a*sizeof(uint8_t)); - u3r_bytes(0, len_a, a_bytes, a_data); - uint8_t* b_bytes = (uint8_t*)malloc(len_a*sizeof(uint8_t)); - u3r_bytes(0, len_a, b_bytes, b_data); + // Unpack the data as a byte array. We assume total length < 2**64. 
+ uint64_t len_a = _get_length(shape); + uint64_t siz_a = len_a * bloq; + uint8_t* a_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); + u3r_bytes(0, siz_a, a_bytes, a_data); + uint8_t* b_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); + u3r_bytes(0, siz_a, b_bytes, b_data); u3_noun r_data; @@ -91,55 +108,145 @@ haxpy(len_a, (float16_t){SB_REAL16_ONE}, (float16_t*)a_bytes, 1, (float16_t*)b_bytes, 1); // Unpack the result back into a noun. - r_data = u3i_bytes(len_a, b_bytes); + r_data = u3i_bytes(siz_a*sizeof(uint8_t), b_bytes); // Clean up. - free(a_bytes); - free(b_bytes); + u3a_free(a_bytes); + u3a_free(b_bytes); - return u3nc(a_data, r_data); - break; + return r_data; case 5: saxpy(len_a, (float32_t){SB_REAL32_ONE}, (float32_t*)a_bytes, 1, (float32_t*)b_bytes, 1); // Unpack the result back into a noun. - r_data = u3i_bytes(len_a, b_bytes); + r_data = u3i_bytes(siz_a*sizeof(uint8_t), b_bytes); // Clean up. - free(a_bytes); - free(b_bytes); + u3a_free(a_bytes); + u3a_free(b_bytes); - return u3nc(a_data, r_data); - break; + return r_data; case 6: daxpy(len_a, (float64_t){SB_REAL64_ONE}, (float64_t*)a_bytes, 1, (float64_t*)b_bytes, 1); // Unpack the result back into a noun. - r_data = u3i_bytes(len_a, b_bytes); + r_data = u3i_bytes(siz_a*sizeof(uint8_t), b_bytes); // Clean up. - free(a_bytes); - free(b_bytes); + u3a_free(a_bytes); + u3a_free(b_bytes); - return u3nc(a_data, r_data); - break; + return r_data; case 7: qaxpy(len_a, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)a_bytes, 1, (float128_t*)b_bytes, 1); // Unpack the result back into a noun. - r_data = u3i_bytes(len_a, b_bytes); + r_data = u3i_bytes(siz_a*sizeof(uint8_t), b_bytes); + + // Clean up. 
+ u3a_free(a_bytes); + u3a_free(b_bytes); + + return r_data; + + default: + u3a_free(a_bytes); + u3a_free(b_bytes); + + return u3_none; + } + } + +/* mmul +*/ + u3_noun + u3qf_la_mmul_real(u3_noun a_data, + u3_noun b_data, + u3_noun a_shape, + u3_noun b_shape, + u3_noun bloq) + { + // Unpack the data as a byte array. We assume total length < 2**64. + uint64_t M = u3h(a_shape); + uint64_t Na = u3h(u3t(a_shape)); + uint64_t Nb = u3h(b_shape); + uint64_t P = u3h(u3t(b_shape)); + + assert(u3_nul == u3t(u3t(a_shape))); + assert(Na == Nb); + uint64_t N = Na; + assert(u3_nul == u3t(u3t(b_shape))); + + uint8_t* a_bytes = (uint8_t*)u3a_malloc((M*N)*sizeof(uint8_t)); + u3r_bytes(0, M*N, a_bytes, a_data); + uint8_t* b_bytes = (uint8_t*)u3a_malloc((N*P)*sizeof(uint8_t)); + u3r_bytes(0, N*P, b_bytes, b_data); + uint8_t* c_bytes = (uint8_t*)u3a_malloc((M*P)*sizeof(uint8_t)); + + u3_noun r_data; + + // Switch on the block size. + switch (bloq) { + case 4: + hgemm('N', 'N', M, N, P, (float16_t){SB_REAL16_ONE}, (float16_t*)a_bytes, N, (float16_t*)b_bytes, N, (float16_t){SB_REAL16_ZERO}, (float16_t*)c_bytes, P); + + // Unpack the result back into a noun. + r_data = u3i_bytes(M*P, c_bytes); + + // Clean up. + u3a_free(a_bytes); + u3a_free(b_bytes); + u3a_free(c_bytes); + + return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); + + case 5: + sgemm('N', 'N', M, N, P, (float32_t){SB_REAL32_ONE}, (float32_t*)a_bytes, N, (float32_t*)b_bytes, N, (float32_t){SB_REAL32_ZERO}, (float32_t*)c_bytes, P); + + // Unpack the result back into a noun. + r_data = u3i_bytes(M*P, c_bytes); + + // Clean up. + u3a_free(a_bytes); + u3a_free(b_bytes); + u3a_free(c_bytes); + + return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); + + case 6: + dgemm('N', 'N', M, N, P, (float64_t){SB_REAL64_ONE}, (float64_t*)a_bytes, N, (float64_t*)b_bytes, N, (float64_t){SB_REAL64_ZERO}, (float64_t*)c_bytes, P); + + // Unpack the result back into a noun. 
+ r_data = u3i_bytes(M*P, c_bytes); // Clean up. - free(a_bytes); - free(b_bytes); + u3a_free(a_bytes); + u3a_free(b_bytes); + u3a_free(c_bytes); - return u3nc(a_data, r_data); - break; + return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); + + case 7: + qgemm('N', 'N', M, N, P, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)a_bytes, N, (float128_t*)b_bytes, N, (float128_t){SB_REAL128L_ZERO,SB_REAL128U_ZERO}, (float128_t*)c_bytes, P); + + // Unpack the result back into a noun. + r_data = u3i_bytes(M*P, c_bytes); + + // Clean up. + u3a_free(a_bytes); + u3a_free(b_bytes); + u3a_free(c_bytes); + + return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); default: + u3a_free(a_bytes); + u3a_free(b_bytes); + u3a_free(c_bytes); + return u3_none; } } @@ -181,14 +288,87 @@ c3n == u3r_sing(a_bloq, b_bloq) || c3n == u3r_sing(a_kind, b_kind) || // fxp does not need to match so no check - c3n == u3r_mean(cor, 31, &rnd, 0) + c3n == u3r_mean(cor, 30, &rnd, 0) + ) + { + return u3m_bail(c3__exit); + } else { + switch (a_kind) { + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_add_real(a_data, b_data, a_shape, a_bloq); + return u3nc(u3nq(a_shape, a_bloq, a_kind, a_fxp), r_data); + break; + + // case c3__int2: + // return u3qf_la_add_int2(a_data, b_data, a_shape, a_bloq); + + // case c3__uint: + // return u3qf_la_add_uint(a_data, b_data, a_shape, a_bloq); + + // case c3__cplx: + // _set_rounding(rnd); + // return u3qf_la_add_cplx(a_data, b_data, a_shape, a_bloq); + + // case c3__unum: + // return u3qf_la_add_unum(a_data, b_data, a_shape, a_bloq); + + // case c3__fixp: + // return u3qf_la_add_fixp(a_data, b_data, a_shape, a_bloq); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wf_la_mmul(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun a_meta, a_data, + b_meta, b_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &a_meta, + u3x_sam_5, &a_data, + u3x_sam_6, &b_meta, + u3x_sam_7, 
&b_data, + 0) || + c3n == u3ud(a_data) || + c3n == u3ud(b_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun a_shape, a_bloq, a_kind, a_fxp, + b_shape, b_bloq, b_kind, b_fxp, + rnd; + if ( c3n == u3r_mean(a_meta, + 2, &a_shape, + 6, &a_bloq, + 14, &a_kind, + 15, &a_fxp, + 0) || + c3n == u3r_mean(b_meta, + 2, &b_shape, + 6, &b_bloq, + 14, &b_kind, + 15, &b_fxp, + 0) || + c3n == u3r_sing(a_bloq, b_bloq) || + c3n == u3r_sing(a_kind, b_kind) || + // fxp does not need to match so no check + c3n == u3r_mean(cor, 30, &rnd, 0) ) { return u3m_bail(c3__exit); } else { switch (a_kind) { case c3__real: - return u3qf_la_add_real(a_data, b_data, a_shape, a_bloq, rnd); + _set_rounding(rnd); + return u3qf_la_mmul_real(a_data, b_data, a_shape, b_shape, a_bloq); + break; // case c3__int2: // return u3qf_la_add_int2(a_data, b_data, a_shape, a_bloq); diff --git a/pkg/noun/jets/q.h b/pkg/noun/jets/q.h index c7cb5ccfd0..ddeed9afcf 100644 --- a/pkg/noun/jets/q.h +++ b/pkg/noun/jets/q.h @@ -247,7 +247,8 @@ u3_noun u3qfp_nepo(u3_noun, u3_noun); u3_noun u3qfp_rake(u3_noun); - u3_noun u3qf_la_add_real(u3_noun, u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_add_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_mmul_real(u3_noun, u3_noun, u3_noun, u3_noun, u3_noun); # define u3qfu_van_fan 28 # define u3qfu_van_rib 58 diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index b6597ffa7f..e70c4073f5 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2152,8 +2152,10 @@ static u3j_core _139_hex_json_d[] = static u3j_core _139_sep_d[] = */ static u3j_harm _139_hex__lagoon_add_a[] = {{".2", u3wf_la_add}, {}}; +static u3j_harm _139_hex__lagoon_mmul_a[] = {{".2", u3wf_la_mmul}, {}}; static u3j_core _139_hex__la_core_d[] = - { { "add-rays", 7, _139_hex__lagoon_add_a, 0, no_hashes }, + { { "add-rays", 7, _139_hex__lagoon_add_a, 0, no_hashes }, + { "mmul", 7, _139_hex__lagoon_mmul_a, 0, no_hashes }, {} }; diff --git a/pkg/noun/jets/w.h b/pkg/noun/jets/w.h 
index 5cc40f1975..23aaab9938 100644 --- a/pkg/noun/jets/w.h +++ b/pkg/noun/jets/w.h @@ -332,5 +332,6 @@ u3_noun u3wfu_rest(u3_noun); u3_noun u3wf_la_add(u3_noun); + u3_noun u3wf_la_mmul(u3_noun); #endif /* ifndef U3_JETS_W_H */ From 171fa5c121c24f5b2b23dfb0818c9f4da534a721 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Tue, 5 Mar 2024 16:24:07 -0600 Subject: [PATCH 08/97] Post ++sub jet. --- pkg/noun/jets/f/lagoon.c | 320 ++++++++++++++++++++++++++++----------- pkg/noun/jets/q.h | 1 + pkg/noun/jets/tree.c | 2 + pkg/noun/jets/w.h | 1 + 4 files changed, 238 insertions(+), 86 deletions(-) diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c index 3622165141..07e0e5dd13 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/f/lagoon.c @@ -87,74 +87,150 @@ /* add - axpy = 1*x+y */ u3_noun - u3qf_la_add_real(u3_noun a_data, - u3_noun b_data, + u3qf_la_add_real(u3_noun x_data, + u3_noun y_data, u3_noun shape, u3_noun bloq) { // Unpack the data as a byte array. We assume total length < 2**64. uint64_t len_a = _get_length(shape); uint64_t siz_a = len_a * bloq; - uint8_t* a_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); - u3r_bytes(0, siz_a, a_bytes, a_data); - uint8_t* b_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); - u3r_bytes(0, siz_a, b_bytes, b_data); + uint8_t* x_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); + u3r_bytes(0, siz_a, x_bytes, x_data); + uint8_t* y_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); + u3r_bytes(0, siz_a, y_bytes, y_data); u3_noun r_data; // Switch on the block size. switch (bloq) { case 4: - haxpy(len_a, (float16_t){SB_REAL16_ONE}, (float16_t*)a_bytes, 1, (float16_t*)b_bytes, 1); + haxpy(len_a, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); // Unpack the result back into a noun. - r_data = u3i_bytes(siz_a*sizeof(uint8_t), b_bytes); + r_data = u3i_bytes(siz_a*sizeof(uint8_t), y_bytes); // Clean up. 
- u3a_free(a_bytes); - u3a_free(b_bytes); + u3a_free(x_bytes); + u3a_free(y_bytes); return r_data; case 5: - saxpy(len_a, (float32_t){SB_REAL32_ONE}, (float32_t*)a_bytes, 1, (float32_t*)b_bytes, 1); + saxpy(len_a, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); // Unpack the result back into a noun. - r_data = u3i_bytes(siz_a*sizeof(uint8_t), b_bytes); + r_data = u3i_bytes(siz_a*sizeof(uint8_t), y_bytes); // Clean up. - u3a_free(a_bytes); - u3a_free(b_bytes); + u3a_free(x_bytes); + u3a_free(y_bytes); return r_data; case 6: - daxpy(len_a, (float64_t){SB_REAL64_ONE}, (float64_t*)a_bytes, 1, (float64_t*)b_bytes, 1); + daxpy(len_a, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); // Unpack the result back into a noun. - r_data = u3i_bytes(siz_a*sizeof(uint8_t), b_bytes); + r_data = u3i_bytes(siz_a*sizeof(uint8_t), y_bytes); // Clean up. - u3a_free(a_bytes); - u3a_free(b_bytes); + u3a_free(x_bytes); + u3a_free(y_bytes); return r_data; case 7: - qaxpy(len_a, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)a_bytes, 1, (float128_t*)b_bytes, 1); + qaxpy(len_a, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); // Unpack the result back into a noun. - r_data = u3i_bytes(siz_a*sizeof(uint8_t), b_bytes); + r_data = u3i_bytes(siz_a*sizeof(uint8_t), y_bytes); // Clean up. - u3a_free(a_bytes); - u3a_free(b_bytes); + u3a_free(x_bytes); + u3a_free(y_bytes); return r_data; default: - u3a_free(a_bytes); - u3a_free(b_bytes); + u3a_free(x_bytes); + u3a_free(y_bytes); + + return u3_none; + } + } + +/* sub - axpy = -1*y+x +*/ + u3_noun + u3qf_la_sub_real(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Unpack the data as a byte array. We assume total length < 2**64. 
+ uint64_t len_a = _get_length(shape); + uint64_t siz_a = len_a * bloq; + uint8_t* x_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); + u3r_bytes(0, siz_a, x_bytes, y_data); + uint8_t* y_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); + u3r_bytes(0, siz_a, y_bytes, x_data); + + u3_noun r_data; + + // Switch on the block size. + switch (bloq) { + case 4: + haxpy(len_a, (float16_t){SB_REAL16_NEGONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes(siz_a*sizeof(uint8_t), y_bytes); + + // Clean up. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + + case 5: + saxpy(len_a, (float32_t){SB_REAL32_NEGONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes(siz_a*sizeof(uint8_t), y_bytes); + + // Clean up. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + + case 6: + daxpy(len_a, (float64_t){SB_REAL64_NEGONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes(siz_a*sizeof(uint8_t), y_bytes); + + // Clean up. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + + case 7: + qaxpy(len_a, (float128_t){SB_REAL128L_NEGONE,SB_REAL128U_NEGONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes(siz_a*sizeof(uint8_t), y_bytes); + + // Clean up. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + + default: + u3a_free(x_bytes); + u3a_free(y_bytes); return u3_none; } @@ -163,27 +239,27 @@ /* mmul */ u3_noun - u3qf_la_mmul_real(u3_noun a_data, - u3_noun b_data, - u3_noun a_shape, - u3_noun b_shape, + u3qf_la_mmul_real(u3_noun x_data, + u3_noun y_data, + u3_noun x_shape, + u3_noun y_shape, u3_noun bloq) { // Unpack the data as a byte array. We assume total length < 2**64. 
- uint64_t M = u3h(a_shape); - uint64_t Na = u3h(u3t(a_shape)); - uint64_t Nb = u3h(b_shape); - uint64_t P = u3h(u3t(b_shape)); + uint64_t M = u3h(x_shape); + uint64_t Na = u3h(u3t(x_shape)); + uint64_t Nb = u3h(y_shape); + uint64_t P = u3h(u3t(y_shape)); - assert(u3_nul == u3t(u3t(a_shape))); + assert(u3_nul == u3t(u3t(x_shape))); assert(Na == Nb); uint64_t N = Na; - assert(u3_nul == u3t(u3t(b_shape))); + assert(u3_nul == u3t(u3t(y_shape))); - uint8_t* a_bytes = (uint8_t*)u3a_malloc((M*N)*sizeof(uint8_t)); - u3r_bytes(0, M*N, a_bytes, a_data); - uint8_t* b_bytes = (uint8_t*)u3a_malloc((N*P)*sizeof(uint8_t)); - u3r_bytes(0, N*P, b_bytes, b_data); + uint8_t* x_bytes = (uint8_t*)u3a_malloc((M*N)*sizeof(uint8_t)); + u3r_bytes(0, M*N, x_bytes, x_data); + uint8_t* y_bytes = (uint8_t*)u3a_malloc((N*P)*sizeof(uint8_t)); + u3r_bytes(0, N*P, y_bytes, y_data); uint8_t* c_bytes = (uint8_t*)u3a_malloc((M*P)*sizeof(uint8_t)); u3_noun r_data; @@ -191,60 +267,60 @@ // Switch on the block size. switch (bloq) { case 4: - hgemm('N', 'N', M, N, P, (float16_t){SB_REAL16_ONE}, (float16_t*)a_bytes, N, (float16_t*)b_bytes, N, (float16_t){SB_REAL16_ZERO}, (float16_t*)c_bytes, P); + hgemm('N', 'N', M, N, P, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, N, (float16_t*)y_bytes, N, (float16_t){SB_REAL16_ZERO}, (float16_t*)c_bytes, P); // Unpack the result back into a noun. r_data = u3i_bytes(M*P, c_bytes); // Clean up. - u3a_free(a_bytes); - u3a_free(b_bytes); + u3a_free(x_bytes); + u3a_free(y_bytes); u3a_free(c_bytes); return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); case 5: - sgemm('N', 'N', M, N, P, (float32_t){SB_REAL32_ONE}, (float32_t*)a_bytes, N, (float32_t*)b_bytes, N, (float32_t){SB_REAL32_ZERO}, (float32_t*)c_bytes, P); + sgemm('N', 'N', M, N, P, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, N, (float32_t*)y_bytes, N, (float32_t){SB_REAL32_ZERO}, (float32_t*)c_bytes, P); // Unpack the result back into a noun. 
r_data = u3i_bytes(M*P, c_bytes); // Clean up. - u3a_free(a_bytes); - u3a_free(b_bytes); + u3a_free(x_bytes); + u3a_free(y_bytes); u3a_free(c_bytes); return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); case 6: - dgemm('N', 'N', M, N, P, (float64_t){SB_REAL64_ONE}, (float64_t*)a_bytes, N, (float64_t*)b_bytes, N, (float64_t){SB_REAL64_ZERO}, (float64_t*)c_bytes, P); + dgemm('N', 'N', M, N, P, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, N, (float64_t*)y_bytes, N, (float64_t){SB_REAL64_ZERO}, (float64_t*)c_bytes, P); // Unpack the result back into a noun. r_data = u3i_bytes(M*P, c_bytes); // Clean up. - u3a_free(a_bytes); - u3a_free(b_bytes); + u3a_free(x_bytes); + u3a_free(y_bytes); u3a_free(c_bytes); return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); case 7: - qgemm('N', 'N', M, N, P, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)a_bytes, N, (float128_t*)b_bytes, N, (float128_t){SB_REAL128L_ZERO,SB_REAL128U_ZERO}, (float128_t*)c_bytes, P); + qgemm('N', 'N', M, N, P, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, N, (float128_t*)y_bytes, N, (float128_t){SB_REAL128L_ZERO,SB_REAL128U_ZERO}, (float128_t*)c_bytes, P); // Unpack the result back into a noun. r_data = u3i_bytes(M*P, c_bytes); // Clean up. 
- u3a_free(a_bytes); - u3a_free(b_bytes); + u3a_free(x_bytes); + u3a_free(y_bytes); u3a_free(c_bytes); return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); default: - u3a_free(a_bytes); - u3a_free(b_bytes); + u3a_free(x_bytes); + u3a_free(y_bytes); u3a_free(c_bytes); return u3_none; @@ -255,36 +331,108 @@ u3wf_la_add(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] - u3_noun a_meta, a_data, - b_meta, b_data; + u3_noun a_meta, x_data, + b_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &a_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &b_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, a_bloq, a_kind, a_fxp, + y_shape, b_bloq, b_kind, b_fxp, + rnd; + if ( c3n == u3r_mean(a_meta, + 2, &x_shape, + 6, &a_bloq, + 14, &a_kind, + 15, &a_fxp, + 0) || + c3n == u3r_mean(b_meta, + 2, &y_shape, + 6, &b_bloq, + 14, &b_kind, + 15, &b_fxp, + 0) || + c3n == u3r_sing(x_shape, y_shape) || + c3n == u3r_sing(a_bloq, b_bloq) || + c3n == u3r_sing(a_kind, b_kind) || + // fxp does not need to match so no check + c3n == u3r_mean(cor, 30, &rnd, 0) + ) + { + return u3m_bail(c3__exit); + } else { + switch (a_kind) { + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_add_real(x_data, y_data, x_shape, a_bloq); + return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); + break; + + // case c3__int2: + // return u3qf_la_add_int2(x_data, y_data, x_shape, a_bloq); + + // case c3__uint: + // return u3qf_la_add_uint(x_data, y_data, x_shape, a_bloq); + + // case c3__cplx: + // _set_rounding(rnd); + // return u3qf_la_add_cplx(x_data, y_data, x_shape, a_bloq); + + // case c3__unum: + // return u3qf_la_add_unum(x_data, y_data, x_shape, a_bloq); + + // case c3__fixp: + // return u3qf_la_add_fixp(x_data, y_data, x_shape, a_bloq); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wf_la_sub(u3_noun cor) + { + // Each argument is a ray, [=meta 
data=@ux] + u3_noun a_meta, x_data, + b_meta, y_data; if ( c3n == u3r_mean(cor, u3x_sam_4, &a_meta, - u3x_sam_5, &a_data, + u3x_sam_5, &x_data, u3x_sam_6, &b_meta, - u3x_sam_7, &b_data, + u3x_sam_7, &y_data, 0) || - c3n == u3ud(a_data) || - c3n == u3ud(b_data) ) + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) { return u3m_bail(c3__exit); } else { - u3_noun a_shape, a_bloq, a_kind, a_fxp, - b_shape, b_bloq, b_kind, b_fxp, + u3_noun x_shape, a_bloq, a_kind, a_fxp, + y_shape, b_bloq, b_kind, b_fxp, rnd; if ( c3n == u3r_mean(a_meta, - 2, &a_shape, + 2, &x_shape, 6, &a_bloq, 14, &a_kind, 15, &a_fxp, 0) || c3n == u3r_mean(b_meta, - 2, &b_shape, + 2, &y_shape, 6, &b_bloq, 14, &b_kind, 15, &b_fxp, 0) || - c3n == u3r_sing(a_shape, b_shape) || + c3n == u3r_sing(x_shape, y_shape) || c3n == u3r_sing(a_bloq, b_bloq) || c3n == u3r_sing(a_kind, b_kind) || // fxp does not need to match so no check @@ -296,25 +444,25 @@ switch (a_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_add_real(a_data, b_data, a_shape, a_bloq); - return u3nc(u3nq(a_shape, a_bloq, a_kind, a_fxp), r_data); + u3_noun r_data = u3qf_la_sub_real(x_data, y_data, x_shape, a_bloq); + return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); break; // case c3__int2: - // return u3qf_la_add_int2(a_data, b_data, a_shape, a_bloq); + // return u3qf_la_sub_int2(x_data, y_data, x_shape, a_bloq); // case c3__uint: - // return u3qf_la_add_uint(a_data, b_data, a_shape, a_bloq); + // return u3qf_la_sub_uint(x_data, y_data, x_shape, a_bloq); // case c3__cplx: // _set_rounding(rnd); - // return u3qf_la_add_cplx(a_data, b_data, a_shape, a_bloq); + // return u3qf_la_sub_cplx(x_data, y_data, x_shape, a_bloq); // case c3__unum: - // return u3qf_la_add_unum(a_data, b_data, a_shape, a_bloq); + // return u3qf_la_sub_unum(x_data, y_data, x_shape, a_bloq); // case c3__fixp: - // return u3qf_la_add_fixp(a_data, b_data, a_shape, a_bloq); + // return u3qf_la_sub_fixp(x_data, y_data, x_shape, a_bloq); default: return 
u3_none; @@ -327,31 +475,31 @@ u3wf_la_mmul(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] - u3_noun a_meta, a_data, - b_meta, b_data; + u3_noun a_meta, x_data, + b_meta, y_data; if ( c3n == u3r_mean(cor, u3x_sam_4, &a_meta, - u3x_sam_5, &a_data, + u3x_sam_5, &x_data, u3x_sam_6, &b_meta, - u3x_sam_7, &b_data, + u3x_sam_7, &y_data, 0) || - c3n == u3ud(a_data) || - c3n == u3ud(b_data) ) + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) { return u3m_bail(c3__exit); } else { - u3_noun a_shape, a_bloq, a_kind, a_fxp, - b_shape, b_bloq, b_kind, b_fxp, + u3_noun x_shape, a_bloq, a_kind, a_fxp, + y_shape, b_bloq, b_kind, b_fxp, rnd; if ( c3n == u3r_mean(a_meta, - 2, &a_shape, + 2, &x_shape, 6, &a_bloq, 14, &a_kind, 15, &a_fxp, 0) || c3n == u3r_mean(b_meta, - 2, &b_shape, + 2, &y_shape, 6, &b_bloq, 14, &b_kind, 15, &b_fxp, @@ -367,23 +515,23 @@ switch (a_kind) { case c3__real: _set_rounding(rnd); - return u3qf_la_mmul_real(a_data, b_data, a_shape, b_shape, a_bloq); + return u3qf_la_mmul_real(x_data, y_data, x_shape, y_shape, a_bloq); break; // case c3__int2: - // return u3qf_la_add_int2(a_data, b_data, a_shape, a_bloq); + // return u3qf_la_add_int2(x_data, y_data, x_shape, a_bloq); // case c3__uint: - // return u3qf_la_add_uint(a_data, b_data, a_shape, a_bloq); + // return u3qf_la_add_uint(x_data, y_data, x_shape, a_bloq); // case c3__cplx: - // return u3qf_la_add_cplx(a_data, b_data, a_shape, a_bloq, rnd); + // return u3qf_la_add_cplx(x_data, y_data, x_shape, a_bloq, rnd); // case c3__unum: - // return u3qf_la_add_unum(a_data, b_data, a_shape, a_bloq); + // return u3qf_la_add_unum(x_data, y_data, x_shape, a_bloq); // case c3__fixp: - // return u3qf_la_add_fixp(a_data, b_data, a_shape, a_bloq); + // return u3qf_la_add_fixp(x_data, y_data, x_shape, a_bloq); default: return u3_none; diff --git a/pkg/noun/jets/q.h b/pkg/noun/jets/q.h index ddeed9afcf..371cebb228 100644 --- a/pkg/noun/jets/q.h +++ b/pkg/noun/jets/q.h @@ -248,6 +248,7 @@ u3_noun u3qfp_rake(u3_noun); 
u3_noun u3qf_la_add_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_sub_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_mmul_real(u3_noun, u3_noun, u3_noun, u3_noun, u3_noun); # define u3qfu_van_fan 28 diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index e70c4073f5..035dad2305 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2152,9 +2152,11 @@ static u3j_core _139_hex_json_d[] = static u3j_core _139_sep_d[] = */ static u3j_harm _139_hex__lagoon_add_a[] = {{".2", u3wf_la_add}, {}}; +static u3j_harm _139_hex__lagoon_sub_a[] = {{".2", u3wf_la_sub}, {}}; static u3j_harm _139_hex__lagoon_mmul_a[] = {{".2", u3wf_la_mmul}, {}}; static u3j_core _139_hex__la_core_d[] = { { "add-rays", 7, _139_hex__lagoon_add_a, 0, no_hashes }, + { "sub-rays", 7, _139_hex__lagoon_sub_a, 0, no_hashes }, { "mmul", 7, _139_hex__lagoon_mmul_a, 0, no_hashes }, {} }; diff --git a/pkg/noun/jets/w.h b/pkg/noun/jets/w.h index 23aaab9938..f7d4ef6a53 100644 --- a/pkg/noun/jets/w.h +++ b/pkg/noun/jets/w.h @@ -332,6 +332,7 @@ u3_noun u3wfu_rest(u3_noun); u3_noun u3wf_la_add(u3_noun); + u3_noun u3wf_la_sub(u3_noun); u3_noun u3wf_la_mmul(u3_noun); #endif /* ifndef U3_JETS_W_H */ From 315b4f5249326d78e53a8d4a435ee46aaf3c569c Mon Sep 17 00:00:00 2001 From: Sigilante Date: Wed, 27 Mar 2024 10:18:41 -0500 Subject: [PATCH 09/97] Post fixed scalar jets using ?scal. --- pkg/noun/jets/f/lagoon.c | 1257 +++++++++++++++++++++++++++++++------- pkg/noun/jets/q.h | 6 + pkg/noun/jets/tree.c | 12 + pkg/noun/jets/w.h | 6 + 4 files changed, 1051 insertions(+), 230 deletions(-) diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c index 07e0e5dd13..ba55c5b4db 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/f/lagoon.c @@ -8,6 +8,7 @@ #include "softblas.h" #include +#include // for pow() #include union half { @@ -94,11 +95,11 @@ { // Unpack the data as a byte array. We assume total length < 2**64. 
uint64_t len_a = _get_length(shape); - uint64_t siz_a = len_a * bloq; + uint64_t siz_a = len_a * pow(2, bloq - 3); uint8_t* x_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); u3r_bytes(0, siz_a, x_bytes, x_data); - uint8_t* y_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); - u3r_bytes(0, siz_a, y_bytes, y_data); + uint8_t* y_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); + u3r_bytes(0, siz_a+1, y_bytes, y_data); u3_noun r_data; @@ -108,7 +109,7 @@ haxpy(len_a, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); // Unpack the result back into a noun. - r_data = u3i_bytes(siz_a*sizeof(uint8_t), y_bytes); + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); // Clean up. u3a_free(x_bytes); @@ -120,7 +121,7 @@ saxpy(len_a, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); // Unpack the result back into a noun. - r_data = u3i_bytes(siz_a*sizeof(uint8_t), y_bytes); + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); // Clean up. u3a_free(x_bytes); @@ -132,7 +133,7 @@ daxpy(len_a, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); // Unpack the result back into a noun. - r_data = u3i_bytes(siz_a*sizeof(uint8_t), y_bytes); + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); // Clean up. u3a_free(x_bytes); @@ -144,7 +145,7 @@ qaxpy(len_a, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); // Unpack the result back into a noun. - r_data = u3i_bytes(siz_a*sizeof(uint8_t), y_bytes); + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); // Clean up. u3a_free(x_bytes); @@ -170,11 +171,11 @@ { // Unpack the data as a byte array. We assume total length < 2**64. 
uint64_t len_a = _get_length(shape); - uint64_t siz_a = len_a * bloq; + uint64_t siz_a = len_a * pow(2, bloq - 3); uint8_t* x_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); - u3r_bytes(0, siz_a, x_bytes, y_data); - uint8_t* y_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); - u3r_bytes(0, siz_a, y_bytes, x_data); + u3r_bytes(0, siz_a, x_bytes, y_data); // XXX + uint8_t* y_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); + u3r_bytes(0, siz_a+1, y_bytes, x_data); // XXX u3_noun r_data; @@ -184,7 +185,7 @@ haxpy(len_a, (float16_t){SB_REAL16_NEGONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); // Unpack the result back into a noun. - r_data = u3i_bytes(siz_a*sizeof(uint8_t), y_bytes); + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); // Clean up. u3a_free(x_bytes); @@ -196,7 +197,7 @@ saxpy(len_a, (float32_t){SB_REAL32_NEGONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); // Unpack the result back into a noun. - r_data = u3i_bytes(siz_a*sizeof(uint8_t), y_bytes); + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); // Clean up. u3a_free(x_bytes); @@ -208,7 +209,7 @@ daxpy(len_a, (float64_t){SB_REAL64_NEGONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); // Unpack the result back into a noun. - r_data = u3i_bytes(siz_a*sizeof(uint8_t), y_bytes); + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); // Clean up. u3a_free(x_bytes); @@ -220,7 +221,7 @@ qaxpy(len_a, (float128_t){SB_REAL128L_NEGONE,SB_REAL128U_NEGONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); // Unpack the result back into a noun. - r_data = u3i_bytes(siz_a*sizeof(uint8_t), y_bytes); + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); // Clean up. 
u3a_free(x_bytes); @@ -236,302 +237,1098 @@ } } -/* mmul +/* mul - x.*y + elementwise multiplication */ u3_noun - u3qf_la_mmul_real(u3_noun x_data, - u3_noun y_data, - u3_noun x_shape, - u3_noun y_shape, - u3_noun bloq) + u3qf_la_mul_real(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) { // Unpack the data as a byte array. We assume total length < 2**64. - uint64_t M = u3h(x_shape); - uint64_t Na = u3h(u3t(x_shape)); - uint64_t Nb = u3h(y_shape); - uint64_t P = u3h(u3t(y_shape)); - - assert(u3_nul == u3t(u3t(x_shape))); - assert(Na == Nb); - uint64_t N = Na; - assert(u3_nul == u3t(u3t(y_shape))); - - uint8_t* x_bytes = (uint8_t*)u3a_malloc((M*N)*sizeof(uint8_t)); - u3r_bytes(0, M*N, x_bytes, x_data); - uint8_t* y_bytes = (uint8_t*)u3a_malloc((N*P)*sizeof(uint8_t)); - u3r_bytes(0, N*P, y_bytes, y_data); - uint8_t* c_bytes = (uint8_t*)u3a_malloc((M*P)*sizeof(uint8_t)); + uint64_t len_a = _get_length(shape); + uint64_t siz_a = len_a * pow(2, bloq - 3); + uint8_t* x_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); + u3r_bytes(0, siz_a, x_bytes, x_data); + uint8_t* y_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); + u3r_bytes(0, siz_a+1, y_bytes, y_data); u3_noun r_data; // Switch on the block size. switch (bloq) { case 4: - hgemm('N', 'N', M, N, P, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, N, (float16_t*)y_bytes, N, (float16_t){SB_REAL16_ZERO}, (float16_t*)c_bytes, P); + for (uint64_t i = 0; i < len_a; i++) { + ((float16_t*)y_bytes)[i] = f16_mul(((float16_t*)x_bytes)[i], ((float16_t*)y_bytes)[i]); + } // Unpack the result back into a noun. - r_data = u3i_bytes(M*P, c_bytes); + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); // Clean up. 
u3a_free(x_bytes); u3a_free(y_bytes); - u3a_free(c_bytes); - return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); + return r_data; case 5: - sgemm('N', 'N', M, N, P, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, N, (float32_t*)y_bytes, N, (float32_t){SB_REAL32_ZERO}, (float32_t*)c_bytes, P); + for (uint64_t i = 0; i < len_a; i++) { + ((float32_t*)y_bytes)[i] = f32_mul(((float32_t*)x_bytes)[i], ((float32_t*)y_bytes)[i]); + } // Unpack the result back into a noun. - r_data = u3i_bytes(M*P, c_bytes); + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); // Clean up. u3a_free(x_bytes); u3a_free(y_bytes); - u3a_free(c_bytes); - return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); + return r_data; case 6: - dgemm('N', 'N', M, N, P, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, N, (float64_t*)y_bytes, N, (float64_t){SB_REAL64_ZERO}, (float64_t*)c_bytes, P); + for (uint64_t i = 0; i < len_a; i++) { + ((float64_t*)y_bytes)[i] = f64_mul(((float64_t*)x_bytes)[i], ((float64_t*)y_bytes)[i]); + } // Unpack the result back into a noun. - r_data = u3i_bytes(M*P, c_bytes); + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); // Clean up. u3a_free(x_bytes); u3a_free(y_bytes); - u3a_free(c_bytes); - return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); + return r_data; case 7: - qgemm('N', 'N', M, N, P, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, N, (float128_t*)y_bytes, N, (float128_t){SB_REAL128L_ZERO,SB_REAL128U_ZERO}, (float128_t*)c_bytes, P); + for (uint64_t i = 0; i < len_a; i++) { + f128M_mul(&(((float128_t*)y_bytes)[i]), &(((float128_t*)x_bytes)[i]), &(((float128_t*)y_bytes)[i])); + } // Unpack the result back into a noun. - r_data = u3i_bytes(M*P, c_bytes); + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); // Clean up. 
u3a_free(x_bytes); u3a_free(y_bytes); - u3a_free(c_bytes); - return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); + return r_data; default: u3a_free(x_bytes); u3a_free(y_bytes); - u3a_free(c_bytes); - + return u3_none; } } +/* div - x/y + elementwise division +*/ u3_noun - u3wf_la_add(u3_noun cor) + u3qf_la_div_real(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) { - // Each argument is a ray, [=meta data=@ux] - u3_noun a_meta, x_data, - b_meta, y_data; - - if ( c3n == u3r_mean(cor, - u3x_sam_4, &a_meta, - u3x_sam_5, &x_data, - u3x_sam_6, &b_meta, - u3x_sam_7, &y_data, - 0) || - c3n == u3ud(x_data) || - c3n == u3ud(y_data) ) - { - return u3m_bail(c3__exit); - } else { - u3_noun x_shape, a_bloq, a_kind, a_fxp, - y_shape, b_bloq, b_kind, b_fxp, - rnd; - if ( c3n == u3r_mean(a_meta, - 2, &x_shape, - 6, &a_bloq, - 14, &a_kind, - 15, &a_fxp, - 0) || - c3n == u3r_mean(b_meta, - 2, &y_shape, - 6, &b_bloq, - 14, &b_kind, - 15, &b_fxp, - 0) || - c3n == u3r_sing(x_shape, y_shape) || - c3n == u3r_sing(a_bloq, b_bloq) || - c3n == u3r_sing(a_kind, b_kind) || - // fxp does not need to match so no check - c3n == u3r_mean(cor, 30, &rnd, 0) - ) - { - return u3m_bail(c3__exit); - } else { - switch (a_kind) { - case c3__real: - _set_rounding(rnd); - u3_noun r_data = u3qf_la_add_real(x_data, y_data, x_shape, a_bloq); - return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); - break; + // Unpack the data as a byte array. We assume total length < 2**64. 
+ uint64_t len_a = _get_length(shape); + uint64_t siz_a = len_a * pow(2, bloq - 3); + uint8_t* x_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); + u3r_bytes(0, siz_a, x_bytes, x_data); + uint8_t* y_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); + u3r_bytes(0, siz_a+1, y_bytes, y_data); - // case c3__int2: - // return u3qf_la_add_int2(x_data, y_data, x_shape, a_bloq); + u3_noun r_data; - // case c3__uint: - // return u3qf_la_add_uint(x_data, y_data, x_shape, a_bloq); + // Switch on the block size. + switch (bloq) { + case 4: + for (uint64_t i = 0; i < len_a; i++) { + ((float16_t*)y_bytes)[i] = f16_div(((float16_t*)x_bytes)[i], ((float16_t*)y_bytes)[i]); + } - // case c3__cplx: - // _set_rounding(rnd); - // return u3qf_la_add_cplx(x_data, y_data, x_shape, a_bloq); + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - // case c3__unum: - // return u3qf_la_add_unum(x_data, y_data, x_shape, a_bloq); + // Clean up. + u3a_free(x_bytes); + u3a_free(y_bytes); - // case c3__fixp: - // return u3qf_la_add_fixp(x_data, y_data, x_shape, a_bloq); + return r_data; - default: - return u3_none; + case 5: + for (uint64_t i = 0; i < len_a; i++) { + ((float32_t*)y_bytes)[i] = f32_div(((float32_t*)x_bytes)[i], ((float32_t*)y_bytes)[i]); } - } - } - } - u3_noun - u3wf_la_sub(u3_noun cor) - { - // Each argument is a ray, [=meta data=@ux] - u3_noun a_meta, x_data, - b_meta, y_data; + // Unpack the result back into a noun. 
+ r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - if ( c3n == u3r_mean(cor, - u3x_sam_4, &a_meta, - u3x_sam_5, &x_data, - u3x_sam_6, &b_meta, - u3x_sam_7, &y_data, - 0) || - c3n == u3ud(x_data) || - c3n == u3ud(y_data) ) - { - return u3m_bail(c3__exit); - } else { - u3_noun x_shape, a_bloq, a_kind, a_fxp, - y_shape, b_bloq, b_kind, b_fxp, - rnd; - if ( c3n == u3r_mean(a_meta, - 2, &x_shape, - 6, &a_bloq, - 14, &a_kind, - 15, &a_fxp, - 0) || - c3n == u3r_mean(b_meta, - 2, &y_shape, - 6, &b_bloq, - 14, &b_kind, - 15, &b_fxp, - 0) || - c3n == u3r_sing(x_shape, y_shape) || - c3n == u3r_sing(a_bloq, b_bloq) || - c3n == u3r_sing(a_kind, b_kind) || - // fxp does not need to match so no check - c3n == u3r_mean(cor, 30, &rnd, 0) - ) - { - return u3m_bail(c3__exit); - } else { - switch (a_kind) { - case c3__real: - _set_rounding(rnd); - u3_noun r_data = u3qf_la_sub_real(x_data, y_data, x_shape, a_bloq); - return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); - break; + // Clean up. + u3a_free(x_bytes); + u3a_free(y_bytes); - // case c3__int2: - // return u3qf_la_sub_int2(x_data, y_data, x_shape, a_bloq); + return r_data; - // case c3__uint: - // return u3qf_la_sub_uint(x_data, y_data, x_shape, a_bloq); + case 6: + for (uint64_t i = 0; i < len_a; i++) { + ((float64_t*)y_bytes)[i] = f64_div(((float64_t*)x_bytes)[i], ((float64_t*)y_bytes)[i]); + } - // case c3__cplx: - // _set_rounding(rnd); - // return u3qf_la_sub_cplx(x_data, y_data, x_shape, a_bloq); + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - // case c3__unum: - // return u3qf_la_sub_unum(x_data, y_data, x_shape, a_bloq); + // Clean up. 
+ u3a_free(x_bytes); + u3a_free(y_bytes); - // case c3__fixp: - // return u3qf_la_sub_fixp(x_data, y_data, x_shape, a_bloq); + return r_data; - default: - return u3_none; + case 7: + for (uint64_t i = 0; i < len_a; i++) { + f128M_div(&(((float128_t*)y_bytes)[i]), &(((float128_t*)x_bytes)[i]), &(((float128_t*)y_bytes)[i])); } - } + + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); + + // Clean up. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + + default: + u3a_free(x_bytes); + u3a_free(y_bytes); + + return u3_none; } } +/* adds - axpy = 1*x+n +*/ u3_noun - u3wf_la_mmul(u3_noun cor) + u3qf_la_adds_real(u3_noun x_data, + u3_noun shape, + u3_noun bloq, + u3_noun n) { - // Each argument is a ray, [=meta data=@ux] - u3_noun a_meta, x_data, - b_meta, y_data; + // Unpack the data as a byte array. We assume total length < 2**64. + uint64_t len_a = _get_length(shape); + uint64_t siz_a = len_a * pow(2, bloq - 3); + uint8_t* x_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); + u3r_bytes(0, siz_a, x_bytes, x_data); + uint8_t* y_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); - if ( c3n == u3r_mean(cor, - u3x_sam_4, &a_meta, - u3x_sam_5, &x_data, - u3x_sam_6, &b_meta, - u3x_sam_7, &y_data, - 0) || - c3n == u3ud(x_data) || - c3n == u3ud(y_data) ) - { - return u3m_bail(c3__exit); - } else { - u3_noun x_shape, a_bloq, a_kind, a_fxp, - y_shape, b_bloq, b_kind, b_fxp, - rnd; - if ( c3n == u3r_mean(a_meta, - 2, &x_shape, - 6, &a_bloq, - 14, &a_kind, - 15, &a_fxp, - 0) || - c3n == u3r_mean(b_meta, - 2, &y_shape, - 6, &b_bloq, - 14, &b_kind, - 15, &b_fxp, - 0) || - c3n == u3r_sing(a_bloq, b_bloq) || - c3n == u3r_sing(a_kind, b_kind) || - // fxp does not need to match so no check - c3n == u3r_mean(cor, 30, &rnd, 0) - ) - { - return u3m_bail(c3__exit); - } else { - switch (a_kind) { - case c3__real: - _set_rounding(rnd); - return u3qf_la_mmul_real(x_data, y_data, x_shape, y_shape, a_bloq); - break; + float16_t 
n16; + float32_t n32; + float64_t n64; + float128_t n128; - // case c3__int2: - // return u3qf_la_add_int2(x_data, y_data, x_shape, a_bloq); + u3_noun r_data; - // case c3__uint: - // return u3qf_la_add_uint(x_data, y_data, x_shape, a_bloq); + // Switch on the block size. We assume that n fits in the target block size; Hoon typecheck should prevent. + switch (bloq) { + case 4: + u3r_bytes(0, 2, (uint8_t*)&n16, n); + // set y to [n] + for (uint64_t i = 0; i < len_a; i++) { + ((float16_t*)y_bytes)[i] = n16; + } + y_bytes[siz_a] = 1; // pin head + haxpy(len_a, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); - // case c3__cplx: - // return u3qf_la_add_cplx(x_data, y_data, x_shape, a_bloq, rnd); + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); + + // Clean up. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + + case 5: + u3r_bytes(0, 4, (uint8_t*)&n32, n); + // set y to [n] + for (uint64_t i = 0; i < len_a; i++) { + ((float32_t*)y_bytes)[i] = n32; + } + y_bytes[siz_a] = 1; // pin head + saxpy(len_a, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); + + // Clean up. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + + case 6: + u3r_bytes(0, 8, (uint8_t*)&n64, n); + // set y to [n] + for (uint64_t i = 0; i < len_a; i++) { + ((float64_t*)y_bytes)[i] = n64; + } + y_bytes[siz_a] = 1; // pin head + daxpy(len_a, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); + + // Clean up. 
+ u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + + case 7: + u3r_bytes(0, 16, (uint8_t*)&n128, n); + // set y to [n] + for (uint64_t i = 0; i < len_a; i++) { + ((float128_t*)y_bytes)[i] = (float128_t){n128.v[0], n128.v[1]}; + } + y_bytes[siz_a] = 1; // pin head + qaxpy(len_a, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); + + // Clean up. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + + default: + u3a_free(x_bytes); + u3a_free(y_bytes); + + return u3_none; + } + } + +/* subs - axpy = -1*n+x +*/ + u3_noun + u3qf_la_subs_real(u3_noun x_data, + u3_noun shape, + u3_noun bloq, + u3_noun n) + { + // Unpack the data as a byte array. We assume total length < 2**64. + uint64_t len_a = _get_length(shape); + uint64_t siz_a = len_a * pow(2, bloq - 3); + uint8_t* x_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); + u3r_bytes(0, siz_a+1, x_bytes, x_data); + uint8_t* y_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); + + float16_t n16; + float32_t n32; + float64_t n64; + float128_t n128; + + u3_noun r_data; + + // Switch on the block size. We assume that n fits in the target block size; Hoon typecheck should prevent. + switch (bloq) { + case 4: + u3r_bytes(0, 2, (uint8_t*)&n16, n); + // set y to [n] + for (uint64_t i = 0; i < len_a; i++) { + ((float16_t*)y_bytes)[i] = n16; + } + haxpy(len_a, (float16_t){SB_REAL16_NEGONE}, (float16_t*)y_bytes, 1, (float16_t*)x_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); + + // Clean up. 
+ u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + + case 5: + u3r_bytes(0, 4, (uint8_t*)&n32, n); + // set y to [n] + for (uint64_t i = 0; i < len_a; i++) { + ((float32_t*)y_bytes)[i] = n32; + } + saxpy(len_a, (float32_t){SB_REAL32_NEGONE}, (float32_t*)y_bytes, 1, (float32_t*)x_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); + + // Clean up. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + + case 6: + u3r_bytes(0, 8, (uint8_t*)&n64, n); + // set y to [n] + for (uint64_t i = 0; i < len_a; i++) { + ((float64_t*)y_bytes)[i] = n64; + } + daxpy(len_a, (float64_t){SB_REAL64_NEGONE}, (float64_t*)y_bytes, 1, (float64_t*)x_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); + + // Clean up. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + + case 7: + u3r_bytes(0, 16, (uint8_t*)&n128, n); + // set y to [n] + for (uint64_t i = 0; i < len_a; i++) { + ((float128_t*)y_bytes)[i] = (float128_t){n128.v[0], n128.v[1]}; + } + qaxpy(len_a, (float128_t){SB_REAL128L_NEGONE,SB_REAL128U_NEGONE}, (float128_t*)y_bytes, 1, (float128_t*)x_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); + + // Clean up. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + + default: + u3a_free(x_bytes); + u3a_free(y_bytes); + + return u3_none; + } + } + +/* muls - x.*[n] + elementwise multiplication +*/ + u3_noun + u3qf_la_muls_real(u3_noun x_data, + u3_noun shape, + u3_noun bloq, + u3_noun n) + { + // Unpack the data as a byte array. We assume total length < 2**64. 
+ uint64_t len_a = _get_length(shape); + uint64_t siz_a = len_a * pow(2, bloq - 3); + uint8_t* x_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); + u3r_bytes(0, siz_a, x_bytes, x_data); + x_bytes[siz_a] = 1; // pin head + + float16_t n16; + float32_t n32; + float64_t n64; + float128_t n128; + + u3_noun r_data; + + // Switch on the block size. + switch (bloq) { + case 4: + u3r_bytes(0, 2, (uint8_t*)&n16, n); + hscal(len_a, n16, (float16_t*)x_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); + + // Clean up. + u3a_free(x_bytes); + + return r_data; + + case 5: + u3r_bytes(0, 4, (uint8_t*)&n32, n); + sscal(len_a, n32, (float32_t*)x_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); + + // Clean up. + u3a_free(x_bytes); + + return r_data; + + case 6: + u3r_bytes(0, 8, (uint8_t*)&n64, n); + dscal(len_a, n64, (float64_t*)x_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); + + // Clean up. + u3a_free(x_bytes); + + return r_data; + + case 7: + u3r_bytes(0, 16, (uint8_t*)&(n128.v[0]), n); + qscal(len_a, n128, (float128_t*)x_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); + + // Clean up. + u3a_free(x_bytes); + + return r_data; + + default: + u3a_free(x_bytes); + + return u3_none; + } + } + +/* divs - x/[n] + elementwise multiplication +*/ + u3_noun + u3qf_la_divs_real(u3_noun x_data, + u3_noun shape, + u3_noun bloq, + u3_noun n) + { + // Unpack the data as a byte array. We assume total length < 2**64. 
+ uint64_t len_a = _get_length(shape); + uint64_t siz_a = len_a * pow(2, bloq - 3); + uint8_t* x_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); + u3r_bytes(0, siz_a, x_bytes, x_data); + x_bytes[siz_a] = 1; // pin head + + float16_t n16; + float32_t n32; + float64_t n64; + float128_t n128; + + u3_noun r_data; + + // Switch on the block size. + switch (bloq) { + case 4: + u3r_bytes(0, 2, (uint8_t*)&n16, n); + n16 = f16_div((float16_t){SB_REAL16_ONE}, n16); + hscal(len_a, n16, (float16_t*)x_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); + + // Clean up. + u3a_free(x_bytes); + + return r_data; + + case 5: + u3r_bytes(0, 4, (uint8_t*)&n32, n); + n32 = f32_div((float32_t){SB_REAL32_ONE}, n32); + sscal(len_a, n32, (float32_t*)x_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); + + // Clean up. + u3a_free(x_bytes); + + return r_data; + + case 6: + u3r_bytes(0, 8, (uint8_t*)&n64, n); + n64 = f64_div((float64_t){SB_REAL64_ONE}, n64); + dscal(len_a, n64, (float64_t*)x_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); + + // Clean up. + u3a_free(x_bytes); + + return r_data; + + case 7: + // u3r_bytes(0, 16, (uint8_t*)&(n128.v[0]), n); + u3l_log("divs: n", n); + u3r_bytes(0, 16, (uint8_t*)&n128, n); + fprintf(stderr, "n128: %lx %lx\r\n", n128.v[0], n128.v[1]); + f128M_div(&((float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}), &n128, &n128); + fprintf(stderr, "one: %lx %lx\r\n", SB_REAL128L_ONE, SB_REAL128U_ONE); + fprintf(stderr, "n128: %lx %lx\r\n", n128.v[0], n128.v[1]); + qscal(len_a, n128, (float128_t*)x_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); + + // Clean up. 
+ u3a_free(x_bytes); + + return r_data; + + default: + u3a_free(x_bytes); + + return u3_none; + } + } + +/* mmul +*/ + u3_noun + u3qf_la_mmul_real(u3_noun x_data, + u3_noun y_data, + u3_noun x_shape, + u3_noun y_shape, + u3_noun bloq) + { + // Unpack the data as a byte array. We assume total length < 2**64. + uint64_t M = u3h(x_shape); + uint64_t Na = u3h(u3t(x_shape)); + uint64_t Nb = u3h(y_shape); + uint64_t P = u3h(u3t(y_shape)); + + assert(u3_nul == u3t(u3t(x_shape))); + assert(Na == Nb); + uint64_t N = Na; + assert(u3_nul == u3t(u3t(y_shape))); + + uint8_t* x_bytes = (uint8_t*)u3a_malloc((M*N)*sizeof(uint8_t)); + u3r_bytes(0, M*N, x_bytes, x_data); + uint8_t* y_bytes = (uint8_t*)u3a_malloc((N*P)*sizeof(uint8_t)); + u3r_bytes(0, N*P, y_bytes, y_data); + uint8_t* c_bytes = (uint8_t*)u3a_malloc((M*P)*sizeof(uint8_t)); + + u3_noun r_data; + + // Switch on the block size. + switch (bloq) { + case 4: + hgemm('N', 'N', M, N, P, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, N, (float16_t*)y_bytes, N, (float16_t){SB_REAL16_ZERO}, (float16_t*)c_bytes, P); + + // Unpack the result back into a noun. + r_data = u3i_bytes(M*P, c_bytes); + + // Clean up. + u3a_free(x_bytes); + u3a_free(y_bytes); + u3a_free(c_bytes); + + return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); + + case 5: + sgemm('N', 'N', M, N, P, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, N, (float32_t*)y_bytes, N, (float32_t){SB_REAL32_ZERO}, (float32_t*)c_bytes, P); + + // Unpack the result back into a noun. + r_data = u3i_bytes(M*P, c_bytes); + + // Clean up. + u3a_free(x_bytes); + u3a_free(y_bytes); + u3a_free(c_bytes); + + return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); + + case 6: + dgemm('N', 'N', M, N, P, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, N, (float64_t*)y_bytes, N, (float64_t){SB_REAL64_ZERO}, (float64_t*)c_bytes, P); + + // Unpack the result back into a noun. + r_data = u3i_bytes(M*P, c_bytes); + + // Clean up. 
+ u3a_free(x_bytes); + u3a_free(y_bytes); + u3a_free(c_bytes); + + return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); + + case 7: + qgemm('N', 'N', M, N, P, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, N, (float128_t*)y_bytes, N, (float128_t){SB_REAL128L_ZERO,SB_REAL128U_ZERO}, (float128_t*)c_bytes, P); + + // Unpack the result back into a noun. + r_data = u3i_bytes(M*P, c_bytes); + + // Clean up. + u3a_free(x_bytes); + u3a_free(y_bytes); + u3a_free(c_bytes); + + return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); + + default: + u3a_free(x_bytes); + u3a_free(y_bytes); + u3a_free(c_bytes); + + return u3_none; + } + } + + u3_noun + u3wf_la_add(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun a_meta, x_data, + b_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &a_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &b_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, a_bloq, a_kind, a_fxp, + y_shape, b_bloq, b_kind, b_fxp, + rnd; + if ( c3n == u3r_mean(a_meta, + 2, &x_shape, + 6, &a_bloq, + 14, &a_kind, + 15, &a_fxp, + 0) || + c3n == u3r_mean(b_meta, + 2, &y_shape, + 6, &b_bloq, + 14, &b_kind, + 15, &b_fxp, + 0) || + c3n == u3r_sing(x_shape, y_shape) || + c3n == u3r_sing(a_bloq, b_bloq) || + c3n == u3r_sing(a_kind, b_kind) || + // fxp does not need to match so no check + c3n == u3r_mean(cor, 30, &rnd, 0) + ) + { + return u3m_bail(c3__exit); + } else { + switch (a_kind) { + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_add_real(x_data, y_data, x_shape, a_bloq); + return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); + break; + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wf_la_sub(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun a_meta, x_data, + b_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &a_meta, + u3x_sam_5, 
&x_data, + u3x_sam_6, &b_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, a_bloq, a_kind, a_fxp, + y_shape, b_bloq, b_kind, b_fxp, + rnd; + if ( c3n == u3r_mean(a_meta, + 2, &x_shape, + 6, &a_bloq, + 14, &a_kind, + 15, &a_fxp, + 0) || + c3n == u3r_mean(b_meta, + 2, &y_shape, + 6, &b_bloq, + 14, &b_kind, + 15, &b_fxp, + 0) || + c3n == u3r_sing(x_shape, y_shape) || + c3n == u3r_sing(a_bloq, b_bloq) || + c3n == u3r_sing(a_kind, b_kind) || + // fxp does not need to match so no check + c3n == u3r_mean(cor, 30, &rnd, 0) + ) + { + return u3m_bail(c3__exit); + } else { + switch (a_kind) { + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_sub_real(x_data, y_data, x_shape, a_bloq); + return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); + break; + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wf_la_mul(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun a_meta, x_data, + b_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &a_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &b_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, a_bloq, a_kind, a_fxp, + y_shape, b_bloq, b_kind, b_fxp, + rnd; + if ( c3n == u3r_mean(a_meta, + 2, &x_shape, + 6, &a_bloq, + 14, &a_kind, + 15, &a_fxp, + 0) || + c3n == u3r_mean(b_meta, + 2, &y_shape, + 6, &b_bloq, + 14, &b_kind, + 15, &b_fxp, + 0) || + c3n == u3r_sing(x_shape, y_shape) || + c3n == u3r_sing(a_bloq, b_bloq) || + c3n == u3r_sing(a_kind, b_kind) || + // fxp does not need to match so no check + c3n == u3r_mean(cor, 30, &rnd, 0) + ) + { + return u3m_bail(c3__exit); + } else { + switch (a_kind) { + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_mul_real(x_data, y_data, x_shape, a_bloq); + return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); + break; + + default: + return 
u3_none; + } + } + } + } + + u3_noun + u3wf_la_div(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun a_meta, x_data, + b_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &a_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &b_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, a_bloq, a_kind, a_fxp, + y_shape, b_bloq, b_kind, b_fxp, + rnd; + if ( c3n == u3r_mean(a_meta, + 2, &x_shape, + 6, &a_bloq, + 14, &a_kind, + 15, &a_fxp, + 0) || + c3n == u3r_mean(b_meta, + 2, &y_shape, + 6, &b_bloq, + 14, &b_kind, + 15, &b_fxp, + 0) || + c3n == u3r_sing(x_shape, y_shape) || + c3n == u3r_sing(a_bloq, b_bloq) || + c3n == u3r_sing(a_kind, b_kind) || + // fxp does not need to match so no check + c3n == u3r_mean(cor, 30, &rnd, 0) + ) + { + return u3m_bail(c3__exit); + } else { + switch (a_kind) { + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_div_real(x_data, y_data, x_shape, a_bloq); + return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); + break; + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wf_la_adds(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun a_meta, x_data, n; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &a_meta, + u3x_sam_5, &x_data, + u3x_sam_3, &n, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(n) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, a_bloq, a_kind, a_fxp, + rnd; + if ( c3n == u3r_mean(a_meta, + 2, &x_shape, + 6, &a_bloq, + 14, &a_kind, + 15, &a_fxp, + 0) || + // shape does not matter so no check + // bloq does not matter so no check + // kind does not matter so no check + // fxp does not need to match so no check + c3n == u3r_mean(cor, 30, &rnd, 0) + ) + { + return u3m_bail(c3__exit); + } else { + switch (a_kind) { + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_adds_real(x_data, x_shape, a_bloq, n); + return u3nc(u3nq(x_shape, a_bloq, a_kind, 
a_fxp), r_data); + break; + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wf_la_subs(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun a_meta, x_data, n; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &a_meta, + u3x_sam_5, &x_data, + u3x_sam_3, &n, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(n) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, a_bloq, a_kind, a_fxp, + rnd; + if ( c3n == u3r_mean(a_meta, + 2, &x_shape, + 6, &a_bloq, + 14, &a_kind, + 15, &a_fxp, + 0) || + // shape does not matter so no check + // bloq does not matter so no check + // kind does not matter so no check + // fxp does not need to match so no check + c3n == u3r_mean(cor, 30, &rnd, 0) + ) + { + return u3m_bail(c3__exit); + } else { + switch (a_kind) { + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_subs_real(x_data, x_shape, a_bloq, n); + return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); + break; + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wf_la_muls(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun a_meta, x_data, n; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &a_meta, + u3x_sam_5, &x_data, + u3x_sam_3, &n, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(n) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, a_bloq, a_kind, a_fxp, + rnd; + if ( c3n == u3r_mean(a_meta, + 2, &x_shape, + 6, &a_bloq, + 14, &a_kind, + 15, &a_fxp, + 0) || + // shape does not matter so no check + // bloq does not matter so no check + // kind does not matter so no check + // fxp does not need to match so no check + c3n == u3r_mean(cor, 30, &rnd, 0) + ) + { + return u3m_bail(c3__exit); + } else { + switch (a_kind) { + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_muls_real(x_data, x_shape, a_bloq, n); + return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); + break; + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wf_la_divs(u3_noun cor) + { + // 
Each argument is a ray, [=meta data=@ux] + u3_noun a_meta, x_data, n; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &a_meta, + u3x_sam_5, &x_data, + u3x_sam_3, &n, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(n) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, a_bloq, a_kind, a_fxp, + rnd; + if ( c3n == u3r_mean(a_meta, + 2, &x_shape, + 6, &a_bloq, + 14, &a_kind, + 15, &a_fxp, + 0) || + // shape does not matter so no check + // bloq does not matter so no check + // kind does not matter so no check + // fxp does not need to match so no check + c3n == u3r_mean(cor, 30, &rnd, 0) + ) + { + return u3m_bail(c3__exit); + } else { + switch (a_kind) { + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_divs_real(x_data, x_shape, a_bloq, n); + return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); + break; + + default: + return u3_none; + } + } + } + } - // case c3__unum: - // return u3qf_la_add_unum(x_data, y_data, x_shape, a_bloq); + u3_noun + u3wf_la_mmul(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun a_meta, x_data, + b_meta, y_data; - // case c3__fixp: - // return u3qf_la_add_fixp(x_data, y_data, x_shape, a_bloq); + if ( c3n == u3r_mean(cor, + u3x_sam_4, &a_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &b_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, a_bloq, a_kind, a_fxp, + y_shape, b_bloq, b_kind, b_fxp, + rnd; + if ( c3n == u3r_mean(a_meta, + 2, &x_shape, + 6, &a_bloq, + 14, &a_kind, + 15, &a_fxp, + 0) || + c3n == u3r_mean(b_meta, + 2, &y_shape, + 6, &b_bloq, + 14, &b_kind, + 15, &b_fxp, + 0) || + c3n == u3r_sing(a_bloq, b_bloq) || + c3n == u3r_sing(a_kind, b_kind) || + // fxp does not need to match so no check + c3n == u3r_mean(cor, 30, &rnd, 0) + ) + { + return u3m_bail(c3__exit); + } else { + switch (a_kind) { + case c3__real: + _set_rounding(rnd); + return u3qf_la_mmul_real(x_data, y_data, x_shape, y_shape, 
a_bloq); + break; default: return u3_none; diff --git a/pkg/noun/jets/q.h b/pkg/noun/jets/q.h index 371cebb228..807f7bdcff 100644 --- a/pkg/noun/jets/q.h +++ b/pkg/noun/jets/q.h @@ -249,6 +249,12 @@ u3_noun u3qf_la_add_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_sub_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_mul_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_div_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_adds_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_subs_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_muls_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_divs_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_mmul_real(u3_noun, u3_noun, u3_noun, u3_noun, u3_noun); # define u3qfu_van_fan 28 diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index 035dad2305..cf82124dac 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2153,10 +2153,22 @@ static u3j_core _139_sep_d[] = */ static u3j_harm _139_hex__lagoon_add_a[] = {{".2", u3wf_la_add}, {}}; static u3j_harm _139_hex__lagoon_sub_a[] = {{".2", u3wf_la_sub}, {}}; +static u3j_harm _139_hex__lagoon_mul_a[] = {{".2", u3wf_la_mul}, {}}; +static u3j_harm _139_hex__lagoon_div_a[] = {{".2", u3wf_la_div}, {}}; +static u3j_harm _139_hex__lagoon_adds_a[] = {{".2", u3wf_la_adds}, {}}; +static u3j_harm _139_hex__lagoon_subs_a[] = {{".2", u3wf_la_subs}, {}}; +static u3j_harm _139_hex__lagoon_muls_a[] = {{".2", u3wf_la_muls}, {}}; +static u3j_harm _139_hex__lagoon_divs_a[] = {{".2", u3wf_la_divs}, {}}; static u3j_harm _139_hex__lagoon_mmul_a[] = {{".2", u3wf_la_mmul}, {}}; static u3j_core _139_hex__la_core_d[] = { { "add-rays", 7, _139_hex__lagoon_add_a, 0, no_hashes }, { "sub-rays", 7, _139_hex__lagoon_sub_a, 0, no_hashes }, + { "mul-rays", 7, _139_hex__lagoon_mul_a, 0, no_hashes }, + { "div-rays", 7, _139_hex__lagoon_div_a, 0, no_hashes }, + { "add-scal", 7, _139_hex__lagoon_adds_a, 0, no_hashes }, + { 
"sub-scal", 7, _139_hex__lagoon_subs_a, 0, no_hashes }, + { "mul-scal", 7, _139_hex__lagoon_muls_a, 0, no_hashes }, + { "div-scal", 7, _139_hex__lagoon_divs_a, 0, no_hashes }, { "mmul", 7, _139_hex__lagoon_mmul_a, 0, no_hashes }, {} }; diff --git a/pkg/noun/jets/w.h b/pkg/noun/jets/w.h index f7d4ef6a53..b833d2ca49 100644 --- a/pkg/noun/jets/w.h +++ b/pkg/noun/jets/w.h @@ -333,6 +333,12 @@ u3_noun u3wf_la_add(u3_noun); u3_noun u3wf_la_sub(u3_noun); + u3_noun u3wf_la_mul(u3_noun); + u3_noun u3wf_la_div(u3_noun); + u3_noun u3wf_la_adds(u3_noun); + u3_noun u3wf_la_subs(u3_noun); + u3_noun u3wf_la_muls(u3_noun); + u3_noun u3wf_la_divs(u3_noun); u3_noun u3wf_la_mmul(u3_noun); #endif /* ifndef U3_JETS_W_H */ From 0a42728a19bbb866f373cf4b93f8ad96dd2415bf Mon Sep 17 00:00:00 2001 From: Sigilante Date: Wed, 27 Mar 2024 12:10:48 -0500 Subject: [PATCH 10/97] Add trace/diag/dot --- pkg/noun/jets/f/lagoon.c | 299 +++++++++++++++++++++++++++++++++++++-- pkg/noun/jets/q.h | 3 + pkg/noun/jets/tree.c | 14 +- pkg/noun/jets/w.h | 3 + 4 files changed, 303 insertions(+), 16 deletions(-) diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c index ba55c5b4db..b6db643273 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/f/lagoon.c @@ -85,6 +85,23 @@ return len; } + static inline uint64_t* _get_dims(u3_noun shape) + { + uint64_t len = u3kb_lent(shape); + uint64_t* dims = (uint64_t*)u3a_malloc(len*sizeof(uint64_t)); + for (uint64_t i = 0; i < len; i++) { + dims[i] = u3h(shape); + shape = u3t(shape); + } + return dims; + } + + static inline size_t _get_array_length(uint64_t* array) + { + size_t n = sizeof(array)/sizeof(array[0]); + return n; + } + /* add - axpy = 1*x+y */ u3_noun @@ -611,7 +628,7 @@ } } -/* muls - x.*[n] +/* muls - ?scal n * x elementwise multiplication */ u3_noun @@ -691,8 +708,8 @@ } } -/* divs - x/[n] - elementwise multiplication +/* divs - ?scal 1/n * x + elementwise division */ u3_noun u3qf_la_divs_real(u3_noun x_data, @@ -717,7 +734,7 @@ // Switch on 
the block size. switch (bloq) { case 4: - u3r_bytes(0, 2, (uint8_t*)&n16, n); + u3r_bytes(0, 2, (uint8_t*)&(n16.v), n); n16 = f16_div((float16_t){SB_REAL16_ONE}, n16); hscal(len_a, n16, (float16_t*)x_bytes, 1); @@ -730,7 +747,7 @@ return r_data; case 5: - u3r_bytes(0, 4, (uint8_t*)&n32, n); + u3r_bytes(0, 4, (uint8_t*)&(n32.v), n); n32 = f32_div((float32_t){SB_REAL32_ONE}, n32); sscal(len_a, n32, (float32_t*)x_bytes, 1); @@ -743,7 +760,7 @@ return r_data; case 6: - u3r_bytes(0, 8, (uint8_t*)&n64, n); + u3r_bytes(0, 8, (uint8_t*)&(n64.v), n); n64 = f64_div((float64_t){SB_REAL64_ONE}, n64); dscal(len_a, n64, (float64_t*)x_bytes, 1); @@ -756,13 +773,8 @@ return r_data; case 7: - // u3r_bytes(0, 16, (uint8_t*)&(n128.v[0]), n); - u3l_log("divs: n", n); - u3r_bytes(0, 16, (uint8_t*)&n128, n); - fprintf(stderr, "n128: %lx %lx\r\n", n128.v[0], n128.v[1]); + u3r_bytes(0, 16, (uint8_t*)&(n128.v[0]), n); f128M_div(&((float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}), &n128, &n128); - fprintf(stderr, "one: %lx %lx\r\n", SB_REAL128L_ONE, SB_REAL128U_ONE); - fprintf(stderr, "n128: %lx %lx\r\n", n128.v[0], n128.v[1]); qscal(len_a, n128, (float128_t*)x_bytes, 1); // Unpack the result back into a noun. @@ -780,6 +792,130 @@ } } +/* dot - ?dot = x · y +*/ + u3_noun + u3qf_la_dot_real(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Unpack the data as a byte array. We assume total length < 2**64. + uint64_t len_a = _get_length(shape); + uint64_t siz_a = len_a * pow(2, bloq - 3); + uint8_t* x_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); + u3r_bytes(0, siz_a, x_bytes, x_data); + uint8_t* y_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); + u3r_bytes(0, siz_a+1, y_bytes, y_data); + + u3_noun r_data; + + // Switch on the block size. + switch (bloq) { + case 4: + hdot(len_a, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); + + // Clean up. 
+ u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + + case 5: + sdot(len_a, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); + + // Clean up. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + + case 6: + ddot(len_a, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); + + // Clean up. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + + case 7: + qdot(len_a, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); + + // Unpack the result back into a noun. + r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); + + // Clean up. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + + default: + u3a_free(x_bytes); + u3a_free(y_bytes); + + return u3_none; + } + } + +/* diag - diag(x) +*/ + u3_noun + u3qf_la_diag(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Unpack shape into an array of dimensions. + uint64_t* dims = _get_dims(shape); + // Assert length of dims is 2. + assert(dims[0] == dims[1]); + assert(_get_array_length(dims) == 2); + + // Unpack the data as a byte array. We assume total length < 2**64. + uint64_t len_a = _get_length(shape); + uint64_t siz_a = len_a * pow(2, bloq - 3); + uint8_t* x_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); + u3r_bytes(0, siz_a, x_bytes, x_data); + uint8_t* y_bytes = (uint8_t*)u3a_malloc((dims[0]*dims[1]+1)*sizeof(uint8_t)); + + u3_noun r_data; + + for (uint64_t i = 0; i < dims[0]; i++) { + y_bytes[i] = x_bytes[i*dims[0] + i]; + } + y_bytes[dims[0]*dims[1]] = 1; // pin head + + // Unpack the result back into a noun. 
+ r_data = u3i_bytes((dims[0]*dims[1]+1)*sizeof(uint8_t), y_bytes); + + u3a_free(x_bytes); + u3a_free(y_bytes); + u3a_free(dims); + + return r_data; + } + +/* trace - tr(x) +*/ + u3_noun + u3qf_la_trace_real(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + u3_noun diag_data = u3qf_la_diag(x_data, shape, bloq); + return u3qf_la_dot_real(diag_data, diag_data, shape, bloq); + } + /* mmul */ u3_noun @@ -1283,6 +1419,145 @@ } } + u3_noun + u3wf_la_dot(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun a_meta, x_data, + b_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &a_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &b_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, a_bloq, a_kind, a_fxp, + y_shape, b_bloq, b_kind, b_fxp, + rnd; + if ( c3n == u3r_mean(a_meta, + 2, &x_shape, + 6, &a_bloq, + 14, &a_kind, + 15, &a_fxp, + 0) || + c3n == u3r_mean(b_meta, + 2, &y_shape, + 6, &b_bloq, + 14, &b_kind, + 15, &b_fxp, + 0) || + c3n == u3r_sing(x_shape, y_shape) || + c3n == u3r_sing(a_bloq, b_bloq) || + c3n == u3r_sing(a_kind, b_kind) || + c3n == u3r_sing(a_fxp, b_fxp) || + c3n == u3r_mean(cor, 30, &rnd, 0) + ) + { + return u3m_bail(c3__exit); + } else { + switch (a_kind) { + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_dot_real(x_data, y_data, x_shape, a_bloq); + return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); + break; + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wf_la_diag(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun a_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &a_meta, + u3x_sam_5, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, a_bloq, a_kind, a_fxp, + rnd; + if ( c3n == u3r_mean(a_meta, + 2, &x_shape, + 6, &a_bloq, + 14, &a_kind, + 15, &a_fxp, + 0) + // c3n == u3r_sing(x_shape, y_shape) || + // c3n == 
u3r_sing(a_bloq, b_bloq) || + // c3n == u3r_sing(a_kind, b_kind) || + // c3n == u3r_sing(a_fxp, b_fxp) || + // c3n == u3r_mean(cor, 30, &rnd, 0) + ) + { + return u3m_bail(c3__exit); + } else { + u3_noun r_data = u3qf_la_diag(x_data, x_shape, a_bloq); + return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); + } + } + } + + u3_noun + u3wf_la_trace(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun a_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &a_meta, + u3x_sam_5, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, a_bloq, a_kind, a_fxp, + rnd; + if ( c3n == u3r_mean(a_meta, + 2, &x_shape, + 6, &a_bloq, + 14, &a_kind, + 15, &a_fxp, + 0) + // c3n == u3r_sing(x_shape, y_shape) || + // c3n == u3r_sing(a_bloq, b_bloq) || + // c3n == u3r_sing(a_kind, b_kind) || + // c3n == u3r_sing(a_fxp, b_fxp) || + // c3n == u3r_mean(cor, 30, &rnd, 0) + ) + { + return u3m_bail(c3__exit); + } else { + switch (a_kind) { + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_trace_real(x_data, x_shape, a_bloq); + uint64_t len_x0 = _get_dims(x_shape)[0]; + return u3nc(u3nq(len_x0, a_bloq, a_kind, a_fxp), r_data); + break; + + default: + return u3_none; + } + } + } + } + u3_noun u3wf_la_mmul(u3_noun cor) { diff --git a/pkg/noun/jets/q.h b/pkg/noun/jets/q.h index 807f7bdcff..db84af39d1 100644 --- a/pkg/noun/jets/q.h +++ b/pkg/noun/jets/q.h @@ -255,6 +255,9 @@ u3_noun u3qf_la_subs_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_muls_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_divs_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_dot_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_diag(u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_trace_real(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_mmul_real(u3_noun, u3_noun, u3_noun, u3_noun, u3_noun); # define u3qfu_van_fan 28 diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index 
cf82124dac..084ba01e98 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2151,14 +2151,17 @@ static u3j_core _139_hex_json_d[] = XX move to outer _sep_ core for /lib? eventually static u3j_core _139_sep_d[] = */ -static u3j_harm _139_hex__lagoon_add_a[] = {{".2", u3wf_la_add}, {}}; -static u3j_harm _139_hex__lagoon_sub_a[] = {{".2", u3wf_la_sub}, {}}; -static u3j_harm _139_hex__lagoon_mul_a[] = {{".2", u3wf_la_mul}, {}}; -static u3j_harm _139_hex__lagoon_div_a[] = {{".2", u3wf_la_div}, {}}; +static u3j_harm _139_hex__lagoon_add_a[] = {{".2", u3wf_la_add}, {}}; +static u3j_harm _139_hex__lagoon_sub_a[] = {{".2", u3wf_la_sub}, {}}; +static u3j_harm _139_hex__lagoon_mul_a[] = {{".2", u3wf_la_mul}, {}}; +static u3j_harm _139_hex__lagoon_div_a[] = {{".2", u3wf_la_div}, {}}; static u3j_harm _139_hex__lagoon_adds_a[] = {{".2", u3wf_la_adds}, {}}; static u3j_harm _139_hex__lagoon_subs_a[] = {{".2", u3wf_la_subs}, {}}; static u3j_harm _139_hex__lagoon_muls_a[] = {{".2", u3wf_la_muls}, {}}; static u3j_harm _139_hex__lagoon_divs_a[] = {{".2", u3wf_la_divs}, {}}; +static u3j_harm _139_hex__lagoon_dot_a[] = {{".2", u3wf_la_dot}, {}}; +static u3j_harm _139_hex__lagoon_diag_a[] = {{".2", u3wf_la_diag}, {}}; +static u3j_harm _139_hex__lagoon_trace_a[]= {{".2", u3wf_la_trace}, {}}; static u3j_harm _139_hex__lagoon_mmul_a[] = {{".2", u3wf_la_mmul}, {}}; static u3j_core _139_hex__la_core_d[] = { { "add-rays", 7, _139_hex__lagoon_add_a, 0, no_hashes }, @@ -2169,6 +2172,9 @@ static u3j_core _139_hex__la_core_d[] = { "sub-scal", 7, _139_hex__lagoon_subs_a, 0, no_hashes }, { "mul-scal", 7, _139_hex__lagoon_muls_a, 0, no_hashes }, { "div-scal", 7, _139_hex__lagoon_divs_a, 0, no_hashes }, + { "dot", 7, _139_hex__lagoon_dot_a, 0, no_hashes }, + { "diag", 7, _139_hex__lagoon_diag_a, 0, no_hashes }, + { "trace", 7, _139_hex__lagoon_trace_a,0, no_hashes }, { "mmul", 7, _139_hex__lagoon_mmul_a, 0, no_hashes }, {} }; diff --git a/pkg/noun/jets/w.h b/pkg/noun/jets/w.h index 
b833d2ca49..2bd87e6551 100644 --- a/pkg/noun/jets/w.h +++ b/pkg/noun/jets/w.h @@ -339,6 +339,9 @@ u3_noun u3wf_la_subs(u3_noun); u3_noun u3wf_la_muls(u3_noun); u3_noun u3wf_la_divs(u3_noun); + u3_noun u3wf_la_dot(u3_noun); + u3_noun u3wf_la_diag(u3_noun); + u3_noun u3wf_la_trace(u3_noun); u3_noun u3wf_la_mmul(u3_noun); #endif /* ifndef U3_JETS_W_H */ From ebe6dd381397b909cf29f28954dda1d11ac7a737 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Wed, 27 Mar 2024 20:36:30 -0500 Subject: [PATCH 11/97] WIP shape+diag error --- pkg/noun/jets/f/lagoon.c | 72 +++++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 27 deletions(-) diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c index b6db643273..baaac63331 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/f/lagoon.c @@ -73,32 +73,42 @@ } } -/* shape +/* length of shape = x * y * z * w * ... */ - static inline uint64_t _get_length(u3_noun shape) + static inline c3_d _get_length(u3_noun shape) { - uint64_t len = 1; + c3_d len = 1; while (u3_nul != shape) { - len = len * u3h(shape); + len = len * u3x_atom(u3h(shape)); shape = u3t(shape); } return len; } - static inline uint64_t* _get_dims(u3_noun shape) +/* get dims from shape as array [x y z w ...] 
+*/ + static inline c3_d* _get_dims(u3_noun shape) { - uint64_t len = u3kb_lent(shape); - uint64_t* dims = (uint64_t*)u3a_malloc(len*sizeof(uint64_t)); - for (uint64_t i = 0; i < len; i++) { - dims[i] = u3h(shape); + u3_atom len = u3qb_lent(shape); + c3_d len_d = u3r_chub(0, len); + c3_d* dims = (c3_d*)u3a_malloc(len_d*sizeof(c3_d)); + for (uint64_t i = 0; i < len_d; i++) { + dims[i] = u3r_chub(0, u3x_atom(u3h(shape))); shape = u3t(shape); } + u3z(len); return dims; } +/* +*/ static inline size_t _get_array_length(uint64_t* array) { size_t n = sizeof(array)/sizeof(array[0]); + for (size_t i = 0; i < n; i++) { + fprintf(stderr, "%x ", array[i]); + } + fprintf(stderr, " => %x \n", n); return n; } @@ -434,7 +444,9 @@ { // Unpack the data as a byte array. We assume total length < 2**64. uint64_t len_a = _get_length(shape); + fprintf(stderr, "len_a: %d 0x%x units\r\n", len_a, len_a); uint64_t siz_a = len_a * pow(2, bloq - 3); + fprintf(stderr, "siz_a: %d 0x%x bytes\r\n", siz_a, siz_a); uint8_t* x_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); u3r_bytes(0, siz_a, x_bytes, x_data); uint8_t* y_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); @@ -875,28 +887,33 @@ u3_noun shape, u3_noun bloq) { - // Unpack shape into an array of dimensions. - uint64_t* dims = _get_dims(shape); // Assert length of dims is 2. + assert(u3qb_lent(shape) == 2); + // Unpack shape into an array of dimensions. + uint64_t *dims = _get_dims(shape); assert(dims[0] == dims[1]); - assert(_get_array_length(dims) == 2); // Unpack the data as a byte array. We assume total length < 2**64. 
uint64_t len_a = _get_length(shape); uint64_t siz_a = len_a * pow(2, bloq - 3); + uint64_t stride = dims[0] * pow(2, bloq - 3); uint8_t* x_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); - u3r_bytes(0, siz_a, x_bytes, x_data); - uint8_t* y_bytes = (uint8_t*)u3a_malloc((dims[0]*dims[1]+1)*sizeof(uint8_t)); + u3r_bytes(0, siz_a+1, x_bytes, x_data); + uint64_t siz_b = stride * dims[1]; + uint8_t* y_bytes = (uint8_t*)u3a_malloc((siz_b+1)*sizeof(uint8_t)); u3_noun r_data; - for (uint64_t i = 0; i < dims[0]; i++) { - y_bytes[i] = x_bytes[i*dims[0] + i]; + for (uint64_t i = 0; i < dims[1]; i++) { + for (uint64_t j = 0; j < stride; j++) { + fprintf(stderr, "i*s+j = %d*%d+%d = %d // x_bytes[i]: %lx\r\n", i, stride, j, i*stride+j, x_bytes[i*stride+j + i]); + y_bytes[i*stride+j] = x_bytes[i*stride+j + i]; + } } - y_bytes[dims[0]*dims[1]] = 1; // pin head + y_bytes[siz_b] = 1; // pin head // Unpack the result back into a noun. - r_data = u3i_bytes((dims[0]*dims[1]+1)*sizeof(uint8_t), y_bytes); + r_data = u3i_bytes((siz_b+1)*sizeof(uint8_t), y_bytes); u3a_free(x_bytes); u3a_free(y_bytes); @@ -913,7 +930,8 @@ u3_noun bloq) { u3_noun diag_data = u3qf_la_diag(x_data, shape, bloq); - return u3qf_la_dot_real(diag_data, diag_data, shape, bloq); + uint64_t len_x0 = _get_dims(shape)[0]; + return u3qf_la_dot_real(diag_data, diag_data, u3nt(len_x0, 0x1, u3_nul), bloq); } /* mmul @@ -1053,7 +1071,7 @@ case c3__real: _set_rounding(rnd); u3_noun r_data = u3qf_la_add_real(x_data, y_data, x_shape, a_bloq); - return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); + return u3nc(u3nq(y_shape, a_bloq, a_kind, a_fxp), r_data); break; default: @@ -1482,8 +1500,8 @@ u3_noun a_meta, x_data; if ( c3n == u3r_mean(cor, - u3x_sam_4, &a_meta, - u3x_sam_5, &x_data, + u3x_sam_2, &a_meta, + u3x_sam_3, &x_data, 0) || c3n == u3ud(x_data) ) { @@ -1507,7 +1525,8 @@ return u3m_bail(c3__exit); } else { u3_noun r_data = u3qf_la_diag(x_data, x_shape, a_bloq); - return u3nc(u3nq(x_shape, a_bloq, a_kind, 
a_fxp), r_data); + uint64_t len_x0 = _get_dims(x_shape)[0]; + return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), a_bloq, a_kind, a_fxp), r_data); } } } @@ -1519,8 +1538,8 @@ u3_noun a_meta, x_data; if ( c3n == u3r_mean(cor, - u3x_sam_4, &a_meta, - u3x_sam_5, &x_data, + u3x_sam_2, &a_meta, + u3x_sam_3, &x_data, 0) || c3n == u3ud(x_data) ) { @@ -1547,8 +1566,7 @@ case c3__real: _set_rounding(rnd); u3_noun r_data = u3qf_la_trace_real(x_data, x_shape, a_bloq); - uint64_t len_x0 = _get_dims(x_shape)[0]; - return u3nc(u3nq(len_x0, a_bloq, a_kind, a_fxp), r_data); + return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), a_bloq, a_kind, a_fxp), r_data); break; default: From b59f33f9870373dce53c38628943faa219e6aa90 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Fri, 12 Apr 2024 17:10:08 -0500 Subject: [PATCH 12/97] Bump SoftBLAS to fix memory error and add progress on jets. --- WORKSPACE.bazel | 2 +- pkg/noun/jets/f/lagoon.c | 1557 ++++++++++++++++++-------------------- pkg/noun/jets/q.h | 1 + pkg/noun/jets/tree.c | 2 + pkg/noun/jets/w.h | 1 + 5 files changed, 761 insertions(+), 802 deletions(-) diff --git a/WORKSPACE.bazel b/WORKSPACE.bazel index 3b0296fd0f..de3564b253 100644 --- a/WORKSPACE.bazel +++ b/WORKSPACE.bazel @@ -297,7 +297,7 @@ versioned_http_archive( strip_prefix = "SoftBLAS-{version}", # sha256 = "", url = "https://github.com/urbit/SoftBLAS/archive/{version}.tar.gz", - version = "bace30db3944c0f2bb2b6cac0db9965675ad842e", + version = "3af44d8cbf0d61e31946af9127099257160d0451", ) versioned_http_archive( diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c index baaac63331..84c58bd191 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/f/lagoon.c @@ -38,8 +38,7 @@ // We could use SoftBLAS set_rounding() to set the SoftFloat // mode as well, but it's more explicit to do it here since // we may use SoftFloat in any given Lagoon jet and we want - // you, dear developer, to see this set here. 
- fprintf(stderr, "%x %c\n", a, a); + // you, dear developer, to see it set here. switch ( a ) { default: @@ -92,7 +91,7 @@ u3_atom len = u3qb_lent(shape); c3_d len_d = u3r_chub(0, len); c3_d* dims = (c3_d*)u3a_malloc(len_d*sizeof(c3_d)); - for (uint64_t i = 0; i < len_d; i++) { + for (c3_d i = 0; i < len_d; i++) { dims[i] = u3r_chub(0, u3x_atom(u3h(shape))); shape = u3t(shape); } @@ -102,7 +101,7 @@ /* */ - static inline size_t _get_array_length(uint64_t* array) + static inline size_t _get_array_length(c3_d* array) { size_t n = sizeof(array)/sizeof(array[0]); for (size_t i = 0; i < n; i++) { @@ -118,74 +117,56 @@ u3qf_la_add_real(u3_noun x_data, u3_noun y_data, u3_noun shape, - u3_noun bloq) + u3_noun bloq + ) { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + // Unpack the data as a byte array. We assume total length < 2**64. - uint64_t len_a = _get_length(shape); - uint64_t siz_a = len_a * pow(2, bloq - 3); - uint8_t* x_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); - u3r_bytes(0, siz_a, x_bytes, x_data); - uint8_t* y_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); - u3r_bytes(0, siz_a+1, y_bytes, y_data); + // len_x is length in base units + c3_d len_x = _get_length(shape); - u3_noun r_data; + // siz_x is length in bytes + c3_d siz_x = len_x * pow(2, bloq-3); + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(siz_x*sizeof(c3_y)); + u3r_bytes(0, siz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); + u3r_bytes(0, siz_x+1, y_bytes, y_data); + // Switch on the block size. switch (bloq) { case 4: - haxpy(len_a, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. 
- u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + haxpy(len_x, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); + break; case 5: - saxpy(len_a, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + saxpy(len_x, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + break; case 6: - daxpy(len_a, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + daxpy(len_x, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + break; case 7: - qaxpy(len_a, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); + qaxpy(len_x, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); + break; + } - return r_data; + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), y_bytes); - default: - u3a_free(x_bytes); - u3a_free(y_bytes); + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); - return u3_none; - } + return r_data; } /* sub - axpy = -1*y+x @@ -194,76 +175,59 @@ u3qf_la_sub_real(u3_noun x_data, u3_noun y_data, u3_noun shape, - u3_noun bloq) + u3_noun bloq + ) { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + // Unpack the data as a byte array. We assume total length < 2**64. 
- uint64_t len_a = _get_length(shape); - uint64_t siz_a = len_a * pow(2, bloq - 3); - uint8_t* x_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); - u3r_bytes(0, siz_a, x_bytes, y_data); // XXX - uint8_t* y_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); - u3r_bytes(0, siz_a+1, y_bytes, x_data); // XXX + // len_x is length in base units + c3_d len_x = _get_length(shape); - u3_noun r_data; + // siz_x is length in bytes + c3_d siz_x = len_x * pow(2, bloq-3); + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(siz_x*sizeof(c3_y)); + u3r_bytes(0, siz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); + u3r_bytes(0, siz_x+1, y_bytes, y_data); + // Switch on the block size. switch (bloq) { case 4: - haxpy(len_a, (float16_t){SB_REAL16_NEGONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + haxpy(len_x, (float16_t){SB_REAL16_NEGONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); + break; case 5: - saxpy(len_a, (float32_t){SB_REAL32_NEGONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + saxpy(len_x, (float32_t){SB_REAL32_NEGONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + break; case 6: - daxpy(len_a, (float64_t){SB_REAL64_NEGONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. 
- u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + daxpy(len_x, (float64_t){SB_REAL64_NEGONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + break; case 7: - qaxpy(len_a, (float128_t){SB_REAL128L_NEGONE,SB_REAL128U_NEGONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); + qaxpy(len_x, (float128_t){SB_REAL128L_NEGONE,SB_REAL128U_NEGONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); + break; + } - return r_data; + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), y_bytes); - default: - u3a_free(x_bytes); - u3a_free(y_bytes); + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); - return u3_none; - } + return r_data; } + /* mul - x.*y elementwise multiplication */ @@ -273,80 +237,61 @@ u3_noun shape, u3_noun bloq) { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + // Unpack the data as a byte array. We assume total length < 2**64. - uint64_t len_a = _get_length(shape); - uint64_t siz_a = len_a * pow(2, bloq - 3); - uint8_t* x_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); - u3r_bytes(0, siz_a, x_bytes, x_data); - uint8_t* y_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); - u3r_bytes(0, siz_a+1, y_bytes, y_data); + // len_x is length in base units + c3_d len_x = _get_length(shape); - u3_noun r_data; + // siz_x is length in bytes + c3_d siz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(siz_x*sizeof(c3_y)); + u3r_bytes(0, siz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); + u3r_bytes(0, siz_x+1, y_bytes, y_data); // Switch on the block size. 
switch (bloq) { case 4: - for (uint64_t i = 0; i < len_a; i++) { + for (c3_d i = 0; i < len_x; i++) { ((float16_t*)y_bytes)[i] = f16_mul(((float16_t*)x_bytes)[i], ((float16_t*)y_bytes)[i]); } - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + break; case 5: - for (uint64_t i = 0; i < len_a; i++) { + for (c3_d i = 0; i < len_x; i++) { ((float32_t*)y_bytes)[i] = f32_mul(((float32_t*)x_bytes)[i], ((float32_t*)y_bytes)[i]); } - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + break; case 6: - for (uint64_t i = 0; i < len_a; i++) { + for (c3_d i = 0; i < len_x; i++) { ((float64_t*)y_bytes)[i] = f64_mul(((float64_t*)x_bytes)[i], ((float64_t*)y_bytes)[i]); } - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + break; case 7: - for (uint64_t i = 0; i < len_a; i++) { + for (c3_d i = 0; i < len_x; i++) { f128M_mul(&(((float128_t*)y_bytes)[i]), &(((float128_t*)x_bytes)[i]), &(((float128_t*)y_bytes)[i])); } + break; + } - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), y_bytes); - default: - u3a_free(x_bytes); - u3a_free(y_bytes); + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); - return u3_none; - } + return r_data; } /* div - x/y @@ -358,286 +303,347 @@ u3_noun shape, u3_noun bloq) { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + // Unpack the data as a byte array. We assume total length < 2**64. 
- uint64_t len_a = _get_length(shape); - uint64_t siz_a = len_a * pow(2, bloq - 3); - uint8_t* x_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); - u3r_bytes(0, siz_a, x_bytes, x_data); - uint8_t* y_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); - u3r_bytes(0, siz_a+1, y_bytes, y_data); + // len_x is length in base units + c3_d len_x = _get_length(shape); - u3_noun r_data; + // siz_x is length in bytes + c3_d siz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(siz_x*sizeof(c3_y)); + u3r_bytes(0, siz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); + u3r_bytes(0, siz_x+1, y_bytes, y_data); // Switch on the block size. switch (bloq) { case 4: - for (uint64_t i = 0; i < len_a; i++) { + for (c3_d i = 0; i < len_x; i++) { ((float16_t*)y_bytes)[i] = f16_div(((float16_t*)x_bytes)[i], ((float16_t*)y_bytes)[i]); } - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + break; case 5: - for (uint64_t i = 0; i < len_a; i++) { + for (c3_d i = 0; i < len_x; i++) { ((float32_t*)y_bytes)[i] = f32_div(((float32_t*)x_bytes)[i], ((float32_t*)y_bytes)[i]); } - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + break; case 6: - for (uint64_t i = 0; i < len_a; i++) { + for (c3_d i = 0; i < len_x; i++) { ((float64_t*)y_bytes)[i] = f64_div(((float64_t*)x_bytes)[i], ((float64_t*)y_bytes)[i]); } - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. 
- u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + break; case 7: - for (uint64_t i = 0; i < len_a; i++) { + for (c3_d i = 0; i < len_x; i++) { f128M_div(&(((float128_t*)y_bytes)[i]), &(((float128_t*)x_bytes)[i]), &(((float128_t*)y_bytes)[i])); } + break; + } - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), y_bytes); - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); - return r_data; + return r_data; + } - default: - u3a_free(x_bytes); - u3a_free(y_bytes); +/* mod - x % y = x - r*floor(x/r) + remainder after division +*/ + u3_noun + u3qf_la_mod_real(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq, + u3_noun rnd) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } - return u3_none; + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // siz_x is length in bytes + c3_d siz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(siz_x*sizeof(c3_y)); + u3r_bytes(0, siz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); + u3r_bytes(0, siz_x+1, y_bytes, y_data); + + // Switch on the block size. 
+ switch (bloq) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + float16_t y_val16 = ((float16_t*)y_bytes)[i]; + // Perform division x/n + float16_t div_result16 = f16_div(x_val16, y_val16); + // Compute floor of the division result + int64_t floor_result16 = f16_to_i64(div_result16, rnd, false); + float16_t floor_float16 = i64_to_f16(floor_result16); + // Multiply n by floor(x/n) + float16_t mult_result16 = f16_mul(y_val16, floor_float16); + // Compute remainder: x - n * floor(x/n) + ((float16_t*)y_bytes)[i] = f16_sub(x_val16, mult_result16); + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + float32_t y_val32 = ((float32_t*)y_bytes)[i]; + // Perform division x/n + float32_t div_result32 = f32_div(x_val32, y_val32); + // Compute floor of the division result + int64_t floor_result32 = f32_to_i64(div_result32, rnd, false); + float32_t floor_float32 = i64_to_f32(floor_result32); + // Multiply n by floor(x/n) + float32_t mult_result32 = f32_mul(y_val32, floor_float32); + // Compute remainder: x - n * floor(x/n) + ((float32_t*)y_bytes)[i] = f32_sub(x_val32, mult_result32); + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + float64_t y_val64 = ((float64_t*)y_bytes)[i]; + // Perform division x/n + float64_t div_result64 = f64_div(x_val64, y_val64); + // Compute floor of the division result + int64_t floor_result64 = f64_to_i64(div_result64, rnd, false); + float64_t floor_float64 = i64_to_f64(floor_result64); + // Multiply n by floor(x/n) + float64_t mult_result64 = f64_mul(y_val64, floor_float64); + // Compute remainder: x - n * floor(x/n) + ((float64_t*)y_bytes)[i] = f64_sub(x_val64, mult_result64); + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + float128_t x_val128 = ((float128_t*)x_bytes)[i]; + float128_t y_val128 = ((float128_t*)y_bytes)[i]; + fprintf(stderr, "x_val128: %llx 
%llx\r\n", x_val128.v[0], x_val128.v[1]); + fprintf(stderr, "y_val128: %llx %llx\r\n", y_val128.v[0], y_val128.v[1]); + // Perform division x/n + float128_t div_result128; + // float128_t div_result128 = f128_div(x_val128, y_val128); + f128M_div((float128_t*)&x_val128, (float128_t*)&y_val128, (float128_t*)&div_result128); + fprintf(stderr, "div_result128: %llx %llx\r\n", div_result128.v[0], div_result128.v[1]); + // Compute floor of the division result + int64_t floor_result128 = f128_to_i64(div_result128, softfloat_round_minMag, false); + fprintf(stderr, "floor_result128: %llx\r\n", floor_result128); + float128_t floor_float128 = i64_to_f128(floor_result128); + fprintf(stderr, "floor_float128: %llx %llx\r\n", floor_float128.v[0], floor_float128.v[1]); + // Multiply n by floor(x/n) + float128_t mult_result128; + // float128_t mult_result128 = f128_mul(y_val128, floor_float128); + f128M_mul(((float128_t*)&y_val128), ((float128_t*)&floor_float128), ((float128_t*)&mult_result128)); + fprintf(stderr, "mult_result128: %llx %llx\r\n", mult_result128.v[0], mult_result128.v[1]); + // Compute remainder: x - n * floor(x/n) + // ((float128_t*)y_bytes)[i] = f128_sub(x_val128, mult_result128); + f128M_div(((float128_t*)&x_val128), ((float128_t*)&mult_result128), &(((float128_t*)y_bytes)[i])); + fprintf(stderr, "y_bytes: %llx %llx\r\n", ((float128_t*)y_bytes)[i].v[0], ((float128_t*)y_bytes)[i].v[1]); + } + // for (c3_d i = 0; i < len_x; i++) { + // f128M_div(&(((float128_t*)y_bytes)[i]), &(((float128_t*)x_bytes)[i]), &(((float128_t*)y_bytes)[i])); + // } + break; } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; } -/* adds - axpy = 1*x+n +/* adds - axpy = 1*x+[n] */ u3_noun u3qf_la_adds_real(u3_noun x_data, + u3_noun n, u3_noun shape, - u3_noun bloq, - u3_noun n) + u3_noun bloq) { + // Fence on valid bloq size. 
+ if (bloq < 4 || bloq > 7) { + return u3_none; + } + // Unpack the data as a byte array. We assume total length < 2**64. - uint64_t len_a = _get_length(shape); - fprintf(stderr, "len_a: %d 0x%x units\r\n", len_a, len_a); - uint64_t siz_a = len_a * pow(2, bloq - 3); - fprintf(stderr, "siz_a: %d 0x%x bytes\r\n", siz_a, siz_a); - uint8_t* x_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); - u3r_bytes(0, siz_a, x_bytes, x_data); - uint8_t* y_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // siz_x is length in bytes + c3_d siz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(siz_x*sizeof(c3_y)); + u3r_bytes(0, siz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); float16_t n16; float32_t n32; float64_t n64; float128_t n128; - u3_noun r_data; - // Switch on the block size. We assume that n fits in the target block size; Hoon typecheck should prevent. switch (bloq) { case 4: - u3r_bytes(0, 2, (uint8_t*)&n16, n); + u3r_bytes(0, 2, (c3_y*)&n16, n); // set y to [n] - for (uint64_t i = 0; i < len_a; i++) { + for (c3_d i = 0; i < len_x; i++) { ((float16_t*)y_bytes)[i] = n16; } - y_bytes[siz_a] = 1; // pin head - haxpy(len_a, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. 
- u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + haxpy(len_x, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); + break; case 5: - u3r_bytes(0, 4, (uint8_t*)&n32, n); + u3r_bytes(0, 4, (c3_y*)&n32, n); // set y to [n] - for (uint64_t i = 0; i < len_a; i++) { + for (c3_d i = 0; i < len_x; i++) { ((float32_t*)y_bytes)[i] = n32; } - y_bytes[siz_a] = 1; // pin head - saxpy(len_a, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + saxpy(len_x, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + break; case 6: - u3r_bytes(0, 8, (uint8_t*)&n64, n); + u3r_bytes(0, 8, (c3_y*)&n64, n); // set y to [n] - for (uint64_t i = 0; i < len_a; i++) { + for (c3_d i = 0; i < len_x; i++) { ((float64_t*)y_bytes)[i] = n64; } - y_bytes[siz_a] = 1; // pin head - daxpy(len_a, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + daxpy(len_x, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + break; case 7: - u3r_bytes(0, 16, (uint8_t*)&n128, n); + u3r_bytes(0, 16, (c3_y*)&n128, n); // set y to [n] - for (uint64_t i = 0; i < len_a; i++) { + for (c3_d i = 0; i < len_x; i++) { ((float128_t*)y_bytes)[i] = (float128_t){n128.v[0], n128.v[1]}; } - y_bytes[siz_a] = 1; // pin head - qaxpy(len_a, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. 
- u3a_free(x_bytes); - u3a_free(y_bytes); + qaxpy(len_x, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); + break; + } - return r_data; + // r_data is the result noun of [data] + y_bytes[siz_x] = 1; // pin head + u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), y_bytes); - default: - u3a_free(x_bytes); - u3a_free(y_bytes); + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); - return u3_none; - } + return r_data; } -/* subs - axpy = -1*n+x +/* subs - axpy = -1*[n]+x */ u3_noun u3qf_la_subs_real(u3_noun x_data, + u3_noun n, u3_noun shape, - u3_noun bloq, - u3_noun n) + u3_noun bloq) { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + // Unpack the data as a byte array. We assume total length < 2**64. - uint64_t len_a = _get_length(shape); - uint64_t siz_a = len_a * pow(2, bloq - 3); - uint8_t* x_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); - u3r_bytes(0, siz_a+1, x_bytes, x_data); - uint8_t* y_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // siz_x is length in bytes + c3_d siz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* x_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); + u3r_bytes(0, siz_x, x_bytes, x_data); + + // y_bytes is the data array (w/o leading 0x1) + c3_y* y_bytes = (c3_y*)u3a_malloc(siz_x*sizeof(c3_y)); float16_t n16; float32_t n32; float64_t n64; float128_t n128; - u3_noun r_data; - // Switch on the block size. We assume that n fits in the target block size; Hoon typecheck should prevent. 
switch (bloq) { case 4: - u3r_bytes(0, 2, (uint8_t*)&n16, n); + u3r_bytes(0, 2, (c3_y*)&n16, n); // set y to [n] - for (uint64_t i = 0; i < len_a; i++) { + for (c3_d i = 0; i < len_x; i++) { ((float16_t*)y_bytes)[i] = n16; } - haxpy(len_a, (float16_t){SB_REAL16_NEGONE}, (float16_t*)y_bytes, 1, (float16_t*)x_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); - - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + haxpy(len_x, (float16_t){SB_REAL16_NEGONE}, (float16_t*)y_bytes, 1, (float16_t*)x_bytes, 1); + break; case 5: - u3r_bytes(0, 4, (uint8_t*)&n32, n); + u3r_bytes(0, 4, (c3_y*)&n32, n); // set y to [n] - for (uint64_t i = 0; i < len_a; i++) { + for (c3_d i = 0; i < len_x; i++) { ((float32_t*)y_bytes)[i] = n32; } - saxpy(len_a, (float32_t){SB_REAL32_NEGONE}, (float32_t*)y_bytes, 1, (float32_t*)x_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); - - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + saxpy(len_x, (float32_t){SB_REAL32_NEGONE}, (float32_t*)y_bytes, 1, (float32_t*)x_bytes, 1); + break; case 6: - u3r_bytes(0, 8, (uint8_t*)&n64, n); + u3r_bytes(0, 8, (c3_y*)&n64, n); // set y to [n] - for (uint64_t i = 0; i < len_a; i++) { + for (c3_d i = 0; i < len_x; i++) { ((float64_t*)y_bytes)[i] = n64; } - daxpy(len_a, (float64_t){SB_REAL64_NEGONE}, (float64_t*)y_bytes, 1, (float64_t*)x_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); - - // Clean up. 
- u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + daxpy(len_x, (float64_t){SB_REAL64_NEGONE}, (float64_t*)y_bytes, 1, (float64_t*)x_bytes, 1); + break; case 7: - u3r_bytes(0, 16, (uint8_t*)&n128, n); + u3r_bytes(0, 16, (c3_y*)&n128, n); // set y to [n] - for (uint64_t i = 0; i < len_a; i++) { + for (c3_d i = 0; i < len_x; i++) { ((float128_t*)y_bytes)[i] = (float128_t){n128.v[0], n128.v[1]}; } - qaxpy(len_a, (float128_t){SB_REAL128L_NEGONE,SB_REAL128U_NEGONE}, (float128_t*)y_bytes, 1, (float128_t*)x_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); - - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); + qaxpy(len_x, (float128_t){SB_REAL128L_NEGONE,SB_REAL128U_NEGONE}, (float128_t*)y_bytes, 1, (float128_t*)x_bytes, 1); + break; + } - return r_data; + // r_data is the result noun of [data] + x_bytes[siz_x] = 1; // pin head + u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), x_bytes); - default: - u3a_free(x_bytes); - u3a_free(y_bytes); + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); - return u3_none; - } + return r_data; } /* muls - ?scal n * x @@ -645,79 +651,62 @@ */ u3_noun u3qf_la_muls_real(u3_noun x_data, + u3_noun n, u3_noun shape, - u3_noun bloq, - u3_noun n) + u3_noun bloq) { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + // Unpack the data as a byte array. We assume total length < 2**64. 
- uint64_t len_a = _get_length(shape); - uint64_t siz_a = len_a * pow(2, bloq - 3); - uint8_t* x_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); - u3r_bytes(0, siz_a, x_bytes, x_data); - x_bytes[siz_a] = 1; // pin head + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // siz_x is length in bytes + c3_d siz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* x_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); + u3r_bytes(0, siz_x, x_bytes, x_data); + x_bytes[siz_x] = 1; // pin head float16_t n16; float32_t n32; float64_t n64; float128_t n128; - u3_noun r_data; - // Switch on the block size. switch (bloq) { case 4: - u3r_bytes(0, 2, (uint8_t*)&n16, n); - hscal(len_a, n16, (float16_t*)x_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); - - // Clean up. - u3a_free(x_bytes); - - return r_data; + u3r_bytes(0, 2, (c3_y*)&n16, n); + hscal(len_x, n16, (float16_t*)x_bytes, 1); + break; case 5: - u3r_bytes(0, 4, (uint8_t*)&n32, n); - sscal(len_a, n32, (float32_t*)x_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); - - // Clean up. - u3a_free(x_bytes); - - return r_data; + u3r_bytes(0, 4, (c3_y*)&n32, n); + sscal(len_x, n32, (float32_t*)x_bytes, 1); + break; case 6: - u3r_bytes(0, 8, (uint8_t*)&n64, n); - dscal(len_a, n64, (float64_t*)x_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); - - // Clean up. - u3a_free(x_bytes); - - return r_data; + u3r_bytes(0, 8, (c3_y*)&n64, n); + dscal(len_x, n64, (float64_t*)x_bytes, 1); + break; case 7: - u3r_bytes(0, 16, (uint8_t*)&(n128.v[0]), n); - qscal(len_a, n128, (float128_t*)x_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); - - // Clean up. 
- u3a_free(x_bytes); + u3r_bytes(0, 16, (c3_y*)&(n128.v[0]), n); + qscal(len_x, n128, (float128_t*)x_bytes, 1); + break; + } - return r_data; + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), x_bytes); - default: - u3a_free(x_bytes); + // Clean up and return. + u3a_free(x_bytes); - return u3_none; - } + return r_data; } /* divs - ?scal 1/n * x @@ -725,83 +714,66 @@ */ u3_noun u3qf_la_divs_real(u3_noun x_data, + u3_noun n, u3_noun shape, - u3_noun bloq, - u3_noun n) + u3_noun bloq) { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + // Unpack the data as a byte array. We assume total length < 2**64. - uint64_t len_a = _get_length(shape); - uint64_t siz_a = len_a * pow(2, bloq - 3); - uint8_t* x_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); - u3r_bytes(0, siz_a, x_bytes, x_data); - x_bytes[siz_a] = 1; // pin head + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // siz_x is length in bytes + c3_d siz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* x_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); + u3r_bytes(0, siz_x, x_bytes, x_data); + x_bytes[siz_x] = 1; // pin head float16_t n16; float32_t n32; float64_t n64; float128_t n128; - u3_noun r_data; - // Switch on the block size. switch (bloq) { case 4: - u3r_bytes(0, 2, (uint8_t*)&(n16.v), n); + u3r_bytes(0, 2, (c3_y*)&(n16.v), n); n16 = f16_div((float16_t){SB_REAL16_ONE}, n16); - hscal(len_a, n16, (float16_t*)x_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); - - // Clean up. 
- u3a_free(x_bytes); - - return r_data; + hscal(len_x, n16, (float16_t*)x_bytes, 1); + break; case 5: - u3r_bytes(0, 4, (uint8_t*)&(n32.v), n); + u3r_bytes(0, 4, (c3_y*)&(n32.v), n); n32 = f32_div((float32_t){SB_REAL32_ONE}, n32); - sscal(len_a, n32, (float32_t*)x_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); - - // Clean up. - u3a_free(x_bytes); - - return r_data; + sscal(len_x, n32, (float32_t*)x_bytes, 1); + break; case 6: - u3r_bytes(0, 8, (uint8_t*)&(n64.v), n); + u3r_bytes(0, 8, (c3_y*)&(n64.v), n); n64 = f64_div((float64_t){SB_REAL64_ONE}, n64); - dscal(len_a, n64, (float64_t*)x_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); - - // Clean up. - u3a_free(x_bytes); - - return r_data; + dscal(len_x, n64, (float64_t*)x_bytes, 1); + break; case 7: - u3r_bytes(0, 16, (uint8_t*)&(n128.v[0]), n); + u3r_bytes(0, 16, (c3_y*)&(n128.v[0]), n); f128M_div(&((float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}), &n128, &n128); - qscal(len_a, n128, (float128_t*)x_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), x_bytes); - - // Clean up. - u3a_free(x_bytes); + qscal(len_x, n128, (float128_t*)x_bytes, 1); + break; + } - return r_data; + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), x_bytes); - default: - u3a_free(x_bytes); + // Clean up and return. + u3a_free(x_bytes); - return u3_none; - } + return r_data; } /* dot - ?dot = x · y @@ -812,72 +784,53 @@ u3_noun shape, u3_noun bloq) { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + // Unpack the data as a byte array. We assume total length < 2**64. 
- uint64_t len_a = _get_length(shape); - uint64_t siz_a = len_a * pow(2, bloq - 3); - uint8_t* x_bytes = (uint8_t*)u3a_malloc(siz_a*sizeof(uint8_t)); - u3r_bytes(0, siz_a, x_bytes, x_data); - uint8_t* y_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); - u3r_bytes(0, siz_a+1, y_bytes, y_data); + // len_x is length in base units + c3_d len_x = _get_length(shape); - u3_noun r_data; + // siz_x is length in bytes + c3_d siz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(siz_x*sizeof(c3_y)); + u3r_bytes(0, siz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); + u3r_bytes(0, siz_x+1, y_bytes, y_data); // Switch on the block size. switch (bloq) { case 4: - hdot(len_a, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + hdot(len_x, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); + break; case 5: - sdot(len_a, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + sdot(len_x, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + break; case 6: - ddot(len_a, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); - - return r_data; + ddot(len_x, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + break; case 7: - qdot(len_a, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); - - // Unpack the result back into a noun. - r_data = u3i_bytes((siz_a+1)*sizeof(uint8_t), y_bytes); - - // Clean up. 
- u3a_free(x_bytes); - u3a_free(y_bytes); + qdot(len_x, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); + break; + } - return r_data; + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), y_bytes); - default: - u3a_free(x_bytes); - u3a_free(y_bytes); + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); - return u3_none; - } + return r_data; } /* diag - diag(x) @@ -890,30 +843,30 @@ // Assert length of dims is 2. assert(u3qb_lent(shape) == 2); // Unpack shape into an array of dimensions. - uint64_t *dims = _get_dims(shape); + c3_d *dims = _get_dims(shape); assert(dims[0] == dims[1]); // Unpack the data as a byte array. We assume total length < 2**64. - uint64_t len_a = _get_length(shape); - uint64_t siz_a = len_a * pow(2, bloq - 3); - uint64_t stride = dims[0] * pow(2, bloq - 3); - uint8_t* x_bytes = (uint8_t*)u3a_malloc((siz_a+1)*sizeof(uint8_t)); - u3r_bytes(0, siz_a+1, x_bytes, x_data); - uint64_t siz_b = stride * dims[1]; - uint8_t* y_bytes = (uint8_t*)u3a_malloc((siz_b+1)*sizeof(uint8_t)); + c3_d len_x = _get_length(shape); + c3_d siz_x = len_x * pow(2, bloq - 3); + c3_d stride = dims[0] * pow(2, bloq - 3); + c3_y* x_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); + u3r_bytes(0, siz_x+1, x_bytes, x_data); + c3_d siz_y = stride * dims[1]; + c3_y* y_bytes = (c3_y*)u3a_malloc((siz_y+1)*sizeof(c3_y)); u3_noun r_data; - for (uint64_t i = 0; i < dims[1]; i++) { - for (uint64_t j = 0; j < stride; j++) { + for (c3_d i = 0; i < dims[1]; i++) { + for (c3_d j = 0; j < stride; j++) { fprintf(stderr, "i*s+j = %d*%d+%d = %d // x_bytes[i]: %lx\r\n", i, stride, j, i*stride+j, x_bytes[i*stride+j + i]); y_bytes[i*stride+j] = x_bytes[i*stride+j + i]; } } - y_bytes[siz_b] = 1; // pin head + y_bytes[siz_y] = 1; // pin head // Unpack the result back into a noun. 
- r_data = u3i_bytes((siz_b+1)*sizeof(uint8_t), y_bytes); + r_data = u3i_bytes((siz_y+1)*sizeof(c3_y), y_bytes); u3a_free(x_bytes); u3a_free(y_bytes); @@ -930,7 +883,7 @@ u3_noun bloq) { u3_noun diag_data = u3qf_la_diag(x_data, shape, bloq); - uint64_t len_x0 = _get_dims(shape)[0]; + c3_d len_x0 = _get_dims(shape)[0]; return u3qf_la_dot_real(diag_data, diag_data, u3nt(len_x0, 0x1, u3_nul), bloq); } @@ -944,21 +897,21 @@ u3_noun bloq) { // Unpack the data as a byte array. We assume total length < 2**64. - uint64_t M = u3h(x_shape); - uint64_t Na = u3h(u3t(x_shape)); - uint64_t Nb = u3h(y_shape); - uint64_t P = u3h(u3t(y_shape)); + c3_d M = u3h(x_shape); + c3_d Na = u3h(u3t(x_shape)); + c3_d Nb = u3h(y_shape); + c3_d P = u3h(u3t(y_shape)); assert(u3_nul == u3t(u3t(x_shape))); assert(Na == Nb); - uint64_t N = Na; + c3_d N = Na; assert(u3_nul == u3t(u3t(y_shape))); - uint8_t* x_bytes = (uint8_t*)u3a_malloc((M*N)*sizeof(uint8_t)); + c3_y* x_bytes = (c3_y*)u3a_malloc((M*N)*sizeof(c3_y)); u3r_bytes(0, M*N, x_bytes, x_data); - uint8_t* y_bytes = (uint8_t*)u3a_malloc((N*P)*sizeof(uint8_t)); + c3_y* y_bytes = (c3_y*)u3a_malloc((N*P)*sizeof(c3_y)); u3r_bytes(0, N*P, y_bytes, y_data); - uint8_t* c_bytes = (uint8_t*)u3a_malloc((M*P)*sizeof(uint8_t)); + c3_y* c_bytes = (c3_y*)u3a_malloc((M*P)*sizeof(c3_y)); u3_noun r_data; @@ -1029,13 +982,13 @@ u3wf_la_add(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] - u3_noun a_meta, x_data, - b_meta, y_data; + u3_noun x_meta, x_data, + y_meta, y_data; if ( c3n == u3r_mean(cor, - u3x_sam_4, &a_meta, + u3x_sam_4, &x_meta, u3x_sam_5, &x_data, - u3x_sam_6, &b_meta, + u3x_sam_6, &y_meta, u3x_sam_7, &y_data, 0) || c3n == u3ud(x_data) || @@ -1043,36 +996,35 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, a_bloq, a_kind, a_fxp, - y_shape, b_bloq, b_kind, b_fxp, + u3_noun x_shape, x_bloq, x_kind, x_fxp, + y_shape, y_bloq, y_kind, y_fxp, rnd; - if ( c3n == u3r_mean(a_meta, - 2, &x_shape, - 6, &a_bloq, - 14, &a_kind, - 15, 
&a_fxp, - 0) || - c3n == u3r_mean(b_meta, - 2, &y_shape, - 6, &b_bloq, - 14, &b_kind, - 15, &b_fxp, - 0) || + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + y_shape = u3h(y_meta); // 2 + y_bloq = u3h(u3t(y_meta)); // 6 + y_kind = u3h(u3t(u3t(y_meta))); // 14 + y_fxp = u3t(u3t(u3t(y_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(y_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(y_kind) || c3n == u3r_sing(x_shape, y_shape) || - c3n == u3r_sing(a_bloq, b_bloq) || - c3n == u3r_sing(a_kind, b_kind) || - // fxp does not need to match so no check - c3n == u3r_mean(cor, 30, &rnd, 0) + c3n == u3r_sing(x_bloq, y_bloq) || + c3n == u3r_sing(x_kind, y_kind) + // fxp does not need to match here so no check ) { return u3m_bail(c3__exit); } else { - switch (a_kind) { + switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_add_real(x_data, y_data, x_shape, a_bloq); - return u3nc(u3nq(y_shape, a_bloq, a_kind, a_fxp), r_data); - break; + u3_noun r_data = u3qf_la_add_real(x_data, y_data, x_shape, x_bloq); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: return u3_none; @@ -1084,14 +1036,14 @@ u3_noun u3wf_la_sub(u3_noun cor) { - // Each argument is a ray, [=meta data=@ux] - u3_noun a_meta, x_data, - b_meta, y_data; + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; if ( c3n == u3r_mean(cor, - u3x_sam_4, &a_meta, + u3x_sam_4, &x_meta, u3x_sam_5, &x_data, - u3x_sam_6, &b_meta, + u3x_sam_6, &y_meta, u3x_sam_7, &y_data, 0) || c3n == u3ud(x_data) || @@ -1099,36 +1051,35 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, a_bloq, a_kind, a_fxp, - y_shape, b_bloq, b_kind, b_fxp, + u3_noun x_shape, x_bloq, x_kind, x_fxp, + y_shape, y_bloq, y_kind, y_fxp, rnd; - if ( c3n == u3r_mean(a_meta, - 2, &x_shape, - 6, &a_bloq, - 14, &a_kind, - 
15, &a_fxp, - 0) || - c3n == u3r_mean(b_meta, - 2, &y_shape, - 6, &b_bloq, - 14, &b_kind, - 15, &b_fxp, - 0) || + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + y_shape = u3h(y_meta); // 2 + y_bloq = u3h(u3t(y_meta)); // 6 + y_kind = u3h(u3t(u3t(y_meta))); // 14 + y_fxp = u3t(u3t(u3t(y_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(y_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(y_kind) || c3n == u3r_sing(x_shape, y_shape) || - c3n == u3r_sing(a_bloq, b_bloq) || - c3n == u3r_sing(a_kind, b_kind) || - // fxp does not need to match so no check - c3n == u3r_mean(cor, 30, &rnd, 0) + c3n == u3r_sing(x_bloq, y_bloq) || + c3n == u3r_sing(x_kind, y_kind) + // fxp does not need to match here so no check ) { return u3m_bail(c3__exit); } else { - switch (a_kind) { + switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_sub_real(x_data, y_data, x_shape, a_bloq); - return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); - break; + u3_noun r_data = u3qf_la_sub_real(x_data, y_data, x_shape, x_bloq); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: return u3_none; @@ -1140,14 +1091,14 @@ u3_noun u3wf_la_mul(u3_noun cor) { - // Each argument is a ray, [=meta data=@ux] - u3_noun a_meta, x_data, - b_meta, y_data; + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; if ( c3n == u3r_mean(cor, - u3x_sam_4, &a_meta, + u3x_sam_4, &x_meta, u3x_sam_5, &x_data, - u3x_sam_6, &b_meta, + u3x_sam_6, &y_meta, u3x_sam_7, &y_data, 0) || c3n == u3ud(x_data) || @@ -1155,36 +1106,35 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, a_bloq, a_kind, a_fxp, - y_shape, b_bloq, b_kind, b_fxp, + u3_noun x_shape, x_bloq, x_kind, x_fxp, + y_shape, y_bloq, y_kind, y_fxp, rnd; - if ( c3n == u3r_mean(a_meta, - 2, &x_shape, - 6, &a_bloq, - 14, &a_kind, 
- 15, &a_fxp, - 0) || - c3n == u3r_mean(b_meta, - 2, &y_shape, - 6, &b_bloq, - 14, &b_kind, - 15, &b_fxp, - 0) || + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + y_shape = u3h(y_meta); // 2 + y_bloq = u3h(u3t(y_meta)); // 6 + y_kind = u3h(u3t(u3t(y_meta))); // 14 + y_fxp = u3t(u3t(u3t(y_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(y_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(y_kind) || c3n == u3r_sing(x_shape, y_shape) || - c3n == u3r_sing(a_bloq, b_bloq) || - c3n == u3r_sing(a_kind, b_kind) || - // fxp does not need to match so no check - c3n == u3r_mean(cor, 30, &rnd, 0) + c3n == u3r_sing(x_bloq, y_bloq) || + c3n == u3r_sing(x_kind, y_kind) + // fxp does not need to match here so no check ) { return u3m_bail(c3__exit); } else { - switch (a_kind) { + switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_mul_real(x_data, y_data, x_shape, a_bloq); - return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); - break; + u3_noun r_data = u3qf_la_mul_real(x_data, y_data, x_shape, x_bloq); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: return u3_none; @@ -1196,14 +1146,14 @@ u3_noun u3wf_la_div(u3_noun cor) { - // Each argument is a ray, [=meta data=@ux] - u3_noun a_meta, x_data, - b_meta, y_data; + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; if ( c3n == u3r_mean(cor, - u3x_sam_4, &a_meta, + u3x_sam_4, &x_meta, u3x_sam_5, &x_data, - u3x_sam_6, &b_meta, + u3x_sam_6, &y_meta, u3x_sam_7, &y_data, 0) || c3n == u3ud(x_data) || @@ -1211,36 +1161,35 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, a_bloq, a_kind, a_fxp, - y_shape, b_bloq, b_kind, b_fxp, + u3_noun x_shape, x_bloq, x_kind, x_fxp, + y_shape, y_bloq, y_kind, y_fxp, rnd; - if ( c3n == u3r_mean(a_meta, - 2, &x_shape, - 6, &a_bloq, - 14, 
&a_kind, - 15, &a_fxp, - 0) || - c3n == u3r_mean(b_meta, - 2, &y_shape, - 6, &b_bloq, - 14, &b_kind, - 15, &b_fxp, - 0) || + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + y_shape = u3h(y_meta); // 2 + y_bloq = u3h(u3t(y_meta)); // 6 + y_kind = u3h(u3t(u3t(y_meta))); // 14 + y_fxp = u3t(u3t(u3t(y_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(y_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(y_kind) || c3n == u3r_sing(x_shape, y_shape) || - c3n == u3r_sing(a_bloq, b_bloq) || - c3n == u3r_sing(a_kind, b_kind) || - // fxp does not need to match so no check - c3n == u3r_mean(cor, 30, &rnd, 0) + c3n == u3r_sing(x_bloq, y_bloq) || + c3n == u3r_sing(x_kind, y_kind) + // fxp does not need to match here so no check ) { return u3m_bail(c3__exit); } else { - switch (a_kind) { + switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_div_real(x_data, y_data, x_shape, a_bloq); - return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); - break; + u3_noun r_data = u3qf_la_div_real(x_data, y_data, x_shape, x_bloq); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: return u3_none; @@ -1250,44 +1199,52 @@ } u3_noun - u3wf_la_adds(u3_noun cor) + u3wf_la_mod(u3_noun cor) { - // Each argument is a ray, [=meta data=@ux] - u3_noun a_meta, x_data, n; + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; if ( c3n == u3r_mean(cor, - u3x_sam_4, &a_meta, + u3x_sam_4, &x_meta, u3x_sam_5, &x_data, - u3x_sam_3, &n, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, 0) || c3n == u3ud(x_data) || - c3n == u3ud(n) ) + c3n == u3ud(y_data) ) { return u3m_bail(c3__exit); } else { - u3_noun x_shape, a_bloq, a_kind, a_fxp, + u3_noun x_shape, x_bloq, x_kind, x_fxp, + y_shape, y_bloq, y_kind, y_fxp, rnd; - if ( c3n == u3r_mean(a_meta, - 2, &x_shape, - 6, &a_bloq, - 
14, &a_kind, - 15, &a_fxp, - 0) || - // shape does not matter so no check - // bloq does not matter so no check - // kind does not matter so no check - // fxp does not need to match so no check - c3n == u3r_mean(cor, 30, &rnd, 0) + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + y_shape = u3h(y_meta); // 2 + y_bloq = u3h(u3t(y_meta)); // 6 + y_kind = u3h(u3t(u3t(y_meta))); // 14 + y_fxp = u3t(u3t(u3t(y_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(y_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(y_kind) || + c3n == u3r_sing(x_shape, y_shape) || + c3n == u3r_sing(x_bloq, y_bloq) || + c3n == u3r_sing(x_kind, y_kind) + // fxp does not need to match here so no check ) { return u3m_bail(c3__exit); } else { - switch (a_kind) { - case c3__real: - _set_rounding(rnd); - u3_noun r_data = u3qf_la_adds_real(x_data, x_shape, a_bloq, n); - return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); - break; + switch (x_kind) { + case c3__real: ; // XX satisfy label + // Global rounding mode is ignored by SoftFloat conversions so we pass it in. 
+ u3_noun r_data = u3qf_la_mod_real(x_data, y_data, x_shape, x_bloq, rnd); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: return u3_none; @@ -1297,13 +1254,13 @@ } u3_noun - u3wf_la_subs(u3_noun cor) + u3wf_la_adds(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] - u3_noun a_meta, x_data, n; + u3_noun x_meta, x_data, n; if ( c3n == u3r_mean(cor, - u3x_sam_4, &a_meta, + u3x_sam_4, &x_meta, u3x_sam_5, &x_data, u3x_sam_3, &n, 0) || @@ -1312,33 +1269,56 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, a_bloq, a_kind, a_fxp, + u3_noun x_shape, x_bloq, x_kind, x_fxp, rnd; - if ( c3n == u3r_mean(a_meta, - 2, &x_shape, - 6, &a_bloq, - 14, &a_kind, - 15, &a_fxp, - 0) || - // shape does not matter so no check - // bloq does not matter so no check - // kind does not matter so no check - // fxp does not need to match so no check - c3n == u3r_mean(cor, 30, &rnd, 0) - ) - { - return u3m_bail(c3__exit); - } else { - switch (a_kind) { - case c3__real: - _set_rounding(rnd); - u3_noun r_data = u3qf_la_subs_real(x_data, x_shape, a_bloq, n); - return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); - break; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + switch (x_kind) { + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_adds_real(x_data, n, x_shape, x_bloq); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + + default: + return u3_none; + } + } + } - default: - return u3_none; - } + u3_noun + u3wf_la_subs(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, n; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_3, &n, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(n) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_fxp, + rnd; + x_shape 
= u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + switch (x_kind) { + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_subs_real(x_data, n, x_shape, x_bloq); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + + default: + return u3_none; } } } @@ -1347,10 +1327,10 @@ u3wf_la_muls(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] - u3_noun a_meta, x_data, n; + u3_noun x_meta, x_data, n; if ( c3n == u3r_mean(cor, - u3x_sam_4, &a_meta, + u3x_sam_4, &x_meta, u3x_sam_5, &x_data, u3x_sam_3, &n, 0) || @@ -1359,33 +1339,21 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, a_bloq, a_kind, a_fxp, + u3_noun x_shape, x_bloq, x_kind, x_fxp, rnd; - if ( c3n == u3r_mean(a_meta, - 2, &x_shape, - 6, &a_bloq, - 14, &a_kind, - 15, &a_fxp, - 0) || - // shape does not matter so no check - // bloq does not matter so no check - // kind does not matter so no check - // fxp does not need to match so no check - c3n == u3r_mean(cor, 30, &rnd, 0) - ) - { - return u3m_bail(c3__exit); - } else { - switch (a_kind) { - case c3__real: - _set_rounding(rnd); - u3_noun r_data = u3qf_la_muls_real(x_data, x_shape, a_bloq, n); - return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); - break; - - default: - return u3_none; - } + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + switch (x_kind) { + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_muls_real(x_data, n, x_shape, x_bloq); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + + default: + return u3_none; } } } @@ -1394,10 +1362,10 @@ u3wf_la_divs(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] - u3_noun a_meta, x_data, n; + u3_noun x_meta, x_data, n; if ( c3n == 
u3r_mean(cor, - u3x_sam_4, &a_meta, + u3x_sam_4, &x_meta, u3x_sam_5, &x_data, u3x_sam_3, &n, 0) || @@ -1406,33 +1374,21 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, a_bloq, a_kind, a_fxp, + u3_noun x_shape, x_bloq, x_kind, x_fxp, rnd; - if ( c3n == u3r_mean(a_meta, - 2, &x_shape, - 6, &a_bloq, - 14, &a_kind, - 15, &a_fxp, - 0) || - // shape does not matter so no check - // bloq does not matter so no check - // kind does not matter so no check - // fxp does not need to match so no check - c3n == u3r_mean(cor, 30, &rnd, 0) - ) - { - return u3m_bail(c3__exit); - } else { - switch (a_kind) { - case c3__real: - _set_rounding(rnd); - u3_noun r_data = u3qf_la_divs_real(x_data, x_shape, a_bloq, n); - return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); - break; - - default: - return u3_none; - } + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + switch (x_kind) { + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_divs_real(x_data, n, x_shape, x_bloq); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + + default: + return u3_none; } } } @@ -1441,13 +1397,13 @@ u3wf_la_dot(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] - u3_noun a_meta, x_data, - b_meta, y_data; + u3_noun x_meta, x_data, + y_meta, y_data; if ( c3n == u3r_mean(cor, - u3x_sam_4, &a_meta, + u3x_sam_4, &x_meta, u3x_sam_5, &x_data, - u3x_sam_6, &b_meta, + u3x_sam_6, &y_meta, u3x_sam_7, &y_data, 0) || c3n == u3ud(x_data) || @@ -1455,36 +1411,35 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, a_bloq, a_kind, a_fxp, - y_shape, b_bloq, b_kind, b_fxp, + u3_noun x_shape, x_bloq, x_kind, x_fxp, + y_shape, y_bloq, y_kind, y_fxp, rnd; - if ( c3n == u3r_mean(a_meta, - 2, &x_shape, - 6, &a_bloq, - 14, &a_kind, - 15, &a_fxp, - 0) || - c3n == u3r_mean(b_meta, - 2, &y_shape, - 6, &b_bloq, - 14, &b_kind, - 
15, &b_fxp, - 0) || + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + y_shape = u3h(y_meta); // 2 + y_bloq = u3h(u3t(y_meta)); // 6 + y_kind = u3h(u3t(u3t(y_meta))); // 14 + y_fxp = u3t(u3t(u3t(y_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(y_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(y_kind) || c3n == u3r_sing(x_shape, y_shape) || - c3n == u3r_sing(a_bloq, b_bloq) || - c3n == u3r_sing(a_kind, b_kind) || - c3n == u3r_sing(a_fxp, b_fxp) || - c3n == u3r_mean(cor, 30, &rnd, 0) + c3n == u3r_sing(x_bloq, y_bloq) || + c3n == u3r_sing(x_kind, y_kind) || + c3n == u3r_sing(x_fxp, y_fxp) ) { return u3m_bail(c3__exit); } else { - switch (a_kind) { + switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_dot_real(x_data, y_data, x_shape, a_bloq); - return u3nc(u3nq(x_shape, a_bloq, a_kind, a_fxp), r_data); - break; + u3_noun r_data = u3qf_la_dot_real(x_data, y_data, x_shape, x_bloq); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: return u3_none; @@ -1497,36 +1452,36 @@ u3wf_la_diag(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] - u3_noun a_meta, x_data; + u3_noun x_meta, x_data; if ( c3n == u3r_mean(cor, - u3x_sam_2, &a_meta, + u3x_sam_2, &x_meta, u3x_sam_3, &x_data, 0) || c3n == u3ud(x_data) ) { return u3m_bail(c3__exit); } else { - u3_noun x_shape, a_bloq, a_kind, a_fxp, + u3_noun x_shape, x_bloq, x_kind, x_fxp, rnd; - if ( c3n == u3r_mean(a_meta, + if ( c3n == u3r_mean(x_meta, 2, &x_shape, - 6, &a_bloq, - 14, &a_kind, - 15, &a_fxp, + 6, &x_bloq, + 14, &x_kind, + 15, &x_fxp, 0) // c3n == u3r_sing(x_shape, y_shape) || - // c3n == u3r_sing(a_bloq, b_bloq) || - // c3n == u3r_sing(a_kind, b_kind) || - // c3n == u3r_sing(a_fxp, b_fxp) || - // c3n == u3r_mean(cor, 30, &rnd, 0) + // c3n == u3r_sing(x_bloq, y_bloq) || + // c3n == u3r_sing(x_kind, y_kind) 
|| + // c3n == u3r_sing(x_fxp, y_fxp) || + // c3n == u3r_mean(cor, u3x_con_sam, &rnd, 0) ) { return u3m_bail(c3__exit); } else { - u3_noun r_data = u3qf_la_diag(x_data, x_shape, a_bloq); - uint64_t len_x0 = _get_dims(x_shape)[0]; - return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), a_bloq, a_kind, a_fxp), r_data); + u3_noun r_data = u3qf_la_diag(x_data, x_shape, x_bloq); + c3_d len_x0 = _get_dims(x_shape)[0]; + return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), x_bloq, x_kind, x_fxp), r_data); } } } @@ -1535,38 +1490,38 @@ u3wf_la_trace(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] - u3_noun a_meta, x_data; + u3_noun x_meta, x_data; if ( c3n == u3r_mean(cor, - u3x_sam_2, &a_meta, + u3x_sam_2, &x_meta, u3x_sam_3, &x_data, 0) || c3n == u3ud(x_data) ) { return u3m_bail(c3__exit); } else { - u3_noun x_shape, a_bloq, a_kind, a_fxp, + u3_noun x_shape, x_bloq, x_kind, x_fxp, rnd; - if ( c3n == u3r_mean(a_meta, + if ( c3n == u3r_mean(x_meta, 2, &x_shape, - 6, &a_bloq, - 14, &a_kind, - 15, &a_fxp, + 6, &x_bloq, + 14, &x_kind, + 15, &x_fxp, 0) // c3n == u3r_sing(x_shape, y_shape) || - // c3n == u3r_sing(a_bloq, b_bloq) || - // c3n == u3r_sing(a_kind, b_kind) || - // c3n == u3r_sing(a_fxp, b_fxp) || - // c3n == u3r_mean(cor, 30, &rnd, 0) + // c3n == u3r_sing(x_bloq, y_bloq) || + // c3n == u3r_sing(x_kind, y_kind) || + // c3n == u3r_sing(x_fxp, y_fxp) || + // c3n == u3r_mean(cor, u3x_con_sam, &rnd, 0) ) { return u3m_bail(c3__exit); } else { - switch (a_kind) { + switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_trace_real(x_data, x_shape, a_bloq); - return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), a_bloq, a_kind, a_fxp), r_data); + u3_noun r_data = u3qf_la_trace_real(x_data, x_shape, x_bloq); + return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), x_bloq, x_kind, x_fxp), r_data); break; default: @@ -1580,13 +1535,13 @@ u3wf_la_mmul(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] - u3_noun a_meta, x_data, - b_meta, y_data; + u3_noun x_meta, x_data, + y_meta, 
y_data; if ( c3n == u3r_mean(cor, - u3x_sam_4, &a_meta, + u3x_sam_4, &x_meta, u3x_sam_5, &x_data, - u3x_sam_6, &b_meta, + u3x_sam_6, &y_meta, u3x_sam_7, &y_data, 0) || c3n == u3ud(x_data) || @@ -1594,33 +1549,33 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, a_bloq, a_kind, a_fxp, - y_shape, b_bloq, b_kind, b_fxp, + u3_noun x_shape, x_bloq, x_kind, x_fxp, + y_shape, y_bloq, y_kind, y_fxp, rnd; - if ( c3n == u3r_mean(a_meta, + if ( c3n == u3r_mean(x_meta, 2, &x_shape, - 6, &a_bloq, - 14, &a_kind, - 15, &a_fxp, + 6, &x_bloq, + 14, &x_kind, + 15, &x_fxp, 0) || - c3n == u3r_mean(b_meta, + c3n == u3r_mean(y_meta, 2, &y_shape, - 6, &b_bloq, - 14, &b_kind, - 15, &b_fxp, + 6, &y_bloq, + 14, &y_kind, + 15, &y_fxp, 0) || - c3n == u3r_sing(a_bloq, b_bloq) || - c3n == u3r_sing(a_kind, b_kind) || + c3n == u3r_sing(x_bloq, y_bloq) || + c3n == u3r_sing(x_kind, y_kind) || // fxp does not need to match so no check - c3n == u3r_mean(cor, 30, &rnd, 0) + c3n == u3r_mean(cor, u3x_con_sam, &rnd, 0) ) { return u3m_bail(c3__exit); } else { - switch (a_kind) { + switch (x_kind) { case c3__real: _set_rounding(rnd); - return u3qf_la_mmul_real(x_data, y_data, x_shape, y_shape, a_bloq); + return u3qf_la_mmul_real(x_data, y_data, x_shape, y_shape, x_bloq); break; default: diff --git a/pkg/noun/jets/q.h b/pkg/noun/jets/q.h index db84af39d1..63d0bb724e 100644 --- a/pkg/noun/jets/q.h +++ b/pkg/noun/jets/q.h @@ -251,6 +251,7 @@ u3_noun u3qf_la_sub_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_mul_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_div_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_mod_real(u3_noun, u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_adds_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_subs_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_muls_real(u3_noun, u3_noun, u3_noun, u3_noun); diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index 084ba01e98..d8375dc4fc 100644 --- a/pkg/noun/jets/tree.c 
+++ b/pkg/noun/jets/tree.c @@ -2155,6 +2155,7 @@ static u3j_harm _139_hex__lagoon_add_a[] = {{".2", u3wf_la_add}, {}}; static u3j_harm _139_hex__lagoon_sub_a[] = {{".2", u3wf_la_sub}, {}}; static u3j_harm _139_hex__lagoon_mul_a[] = {{".2", u3wf_la_mul}, {}}; static u3j_harm _139_hex__lagoon_div_a[] = {{".2", u3wf_la_div}, {}}; +static u3j_harm _139_hex__lagoon_mod_a[] = {{".2", u3wf_la_mod}, {}}; static u3j_harm _139_hex__lagoon_adds_a[] = {{".2", u3wf_la_adds}, {}}; static u3j_harm _139_hex__lagoon_subs_a[] = {{".2", u3wf_la_subs}, {}}; static u3j_harm _139_hex__lagoon_muls_a[] = {{".2", u3wf_la_muls}, {}}; @@ -2168,6 +2169,7 @@ static u3j_core _139_hex__la_core_d[] = { "sub-rays", 7, _139_hex__lagoon_sub_a, 0, no_hashes }, { "mul-rays", 7, _139_hex__lagoon_mul_a, 0, no_hashes }, { "div-rays", 7, _139_hex__lagoon_div_a, 0, no_hashes }, + { "mod-rays", 7, _139_hex__lagoon_mod_a, 0, no_hashes }, { "add-scal", 7, _139_hex__lagoon_adds_a, 0, no_hashes }, { "sub-scal", 7, _139_hex__lagoon_subs_a, 0, no_hashes }, { "mul-scal", 7, _139_hex__lagoon_muls_a, 0, no_hashes }, diff --git a/pkg/noun/jets/w.h b/pkg/noun/jets/w.h index 2bd87e6551..0d8dadbf2f 100644 --- a/pkg/noun/jets/w.h +++ b/pkg/noun/jets/w.h @@ -335,6 +335,7 @@ u3_noun u3wf_la_sub(u3_noun); u3_noun u3wf_la_mul(u3_noun); u3_noun u3wf_la_div(u3_noun); + u3_noun u3wf_la_mod(u3_noun); u3_noun u3wf_la_adds(u3_noun); u3_noun u3wf_la_subs(u3_noun); u3_noun u3wf_la_muls(u3_noun); From ee9d1e10f19852a242a964b3974a0df51fd9f394 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Fri, 12 Apr 2024 19:27:09 -0500 Subject: [PATCH 13/97] Posting some jets. 
--- pkg/noun/jets/f/lagoon.c | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c index 84c58bd191..9edb40095a 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/f/lagoon.c @@ -99,18 +99,6 @@ return dims; } -/* -*/ - static inline size_t _get_array_length(c3_d* array) - { - size_t n = sizeof(array)/sizeof(array[0]); - for (size_t i = 0; i < n; i++) { - fprintf(stderr, "%x ", array[i]); - } - fprintf(stderr, " => %x \n", n); - return n; - } - /* add - axpy = 1*x+y */ u3_noun @@ -444,31 +432,18 @@ for (c3_d i = 0; i < len_x; i++) { float128_t x_val128 = ((float128_t*)x_bytes)[i]; float128_t y_val128 = ((float128_t*)y_bytes)[i]; - fprintf(stderr, "x_val128: %llx %llx\r\n", x_val128.v[0], x_val128.v[1]); - fprintf(stderr, "y_val128: %llx %llx\r\n", y_val128.v[0], y_val128.v[1]); // Perform division x/n float128_t div_result128; - // float128_t div_result128 = f128_div(x_val128, y_val128); f128M_div((float128_t*)&x_val128, (float128_t*)&y_val128, (float128_t*)&div_result128); - fprintf(stderr, "div_result128: %llx %llx\r\n", div_result128.v[0], div_result128.v[1]); // Compute floor of the division result - int64_t floor_result128 = f128_to_i64(div_result128, softfloat_round_minMag, false); - fprintf(stderr, "floor_result128: %llx\r\n", floor_result128); + int64_t floor_result128 = f128_to_i64(div_result128, rnd, false); float128_t floor_float128 = i64_to_f128(floor_result128); - fprintf(stderr, "floor_float128: %llx %llx\r\n", floor_float128.v[0], floor_float128.v[1]); // Multiply n by floor(x/n) float128_t mult_result128; - // float128_t mult_result128 = f128_mul(y_val128, floor_float128); f128M_mul(((float128_t*)&y_val128), ((float128_t*)&floor_float128), ((float128_t*)&mult_result128)); - fprintf(stderr, "mult_result128: %llx %llx\r\n", mult_result128.v[0], mult_result128.v[1]); // Compute remainder: x - n * floor(x/n) - // ((float128_t*)y_bytes)[i] = 
f128_sub(x_val128, mult_result128); - f128M_div(((float128_t*)&x_val128), ((float128_t*)&mult_result128), &(((float128_t*)y_bytes)[i])); - fprintf(stderr, "y_bytes: %llx %llx\r\n", ((float128_t*)y_bytes)[i].v[0], ((float128_t*)y_bytes)[i].v[1]); + f128M_sub(((float128_t*)&x_val128), ((float128_t*)&mult_result128), &(((float128_t*)y_bytes)[i])); } - // for (c3_d i = 0; i < len_x; i++) { - // f128M_div(&(((float128_t*)y_bytes)[i]), &(((float128_t*)x_bytes)[i]), &(((float128_t*)y_bytes)[i])); - // } break; } From a5c79a519c476edf0ec040567e9542618c063da8 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Sat, 13 Apr 2024 12:37:46 -0500 Subject: [PATCH 14/97] Post mod/mods jets. --- pkg/noun/jets/f/lagoon.c | 217 ++++++++++++++++++++++++++++++++++----- pkg/noun/jets/q.h | 3 +- pkg/noun/jets/tree.c | 2 + pkg/noun/jets/w.h | 1 + 4 files changed, 198 insertions(+), 25 deletions(-) diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c index 9edb40095a..8ae1e77e03 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/f/lagoon.c @@ -355,8 +355,7 @@ u3qf_la_mod_real(u3_noun x_data, u3_noun y_data, u3_noun shape, - u3_noun bloq, - u3_noun rnd) + u3_noun bloq) { // Fence on valid bloq size. 
if (bloq < 4 || bloq > 7) { @@ -387,7 +386,7 @@ // Perform division x/n float16_t div_result16 = f16_div(x_val16, y_val16); // Compute floor of the division result - int64_t floor_result16 = f16_to_i64(div_result16, rnd, false); + int64_t floor_result16 = f16_to_i64(div_result16, softfloat_round_minMag, false); float16_t floor_float16 = i64_to_f16(floor_result16); // Multiply n by floor(x/n) float16_t mult_result16 = f16_mul(y_val16, floor_float16); @@ -403,7 +402,7 @@ // Perform division x/n float32_t div_result32 = f32_div(x_val32, y_val32); // Compute floor of the division result - int64_t floor_result32 = f32_to_i64(div_result32, rnd, false); + int64_t floor_result32 = f32_to_i64(div_result32, softfloat_round_minMag, false); float32_t floor_float32 = i64_to_f32(floor_result32); // Multiply n by floor(x/n) float32_t mult_result32 = f32_mul(y_val32, floor_float32); @@ -419,7 +418,7 @@ // Perform division x/n float64_t div_result64 = f64_div(x_val64, y_val64); // Compute floor of the division result - int64_t floor_result64 = f64_to_i64(div_result64, rnd, false); + int64_t floor_result64 = f64_to_i64(div_result64, softfloat_round_minMag, false); float64_t floor_float64 = i64_to_f64(floor_result64); // Multiply n by floor(x/n) float64_t mult_result64 = f64_mul(y_val64, floor_float64); @@ -436,7 +435,7 @@ float128_t div_result128; f128M_div((float128_t*)&x_val128, (float128_t*)&y_val128, (float128_t*)&div_result128); // Compute floor of the division result - int64_t floor_result128 = f128_to_i64(div_result128, rnd, false); + int64_t floor_result128 = f128_to_i64(div_result128, softfloat_round_minMag, false); float128_t floor_float128 = i64_to_f128(floor_result128); // Multiply n by floor(x/n) float128_t mult_result128; @@ -710,35 +709,170 @@ u3r_bytes(0, siz_x, x_bytes, x_data); x_bytes[siz_x] = 1; // pin head - float16_t n16; - float32_t n32; - float64_t n64; - float128_t n128; + float16_t in16; + float32_t in32; + float64_t in64; + float128_t in128; // Switch on 
the block size. switch (bloq) { case 4: - u3r_bytes(0, 2, (c3_y*)&(n16.v), n); - n16 = f16_div((float16_t){SB_REAL16_ONE}, n16); - hscal(len_x, n16, (float16_t*)x_bytes, 1); + // XX note that in16 is doing double duty here + u3r_bytes(0, 2, (c3_y*)&(in16.v), n); + in16 = f16_div((float16_t){SB_REAL16_ONE}, in16); + hscal(len_x, in16, (float16_t*)x_bytes, 1); break; case 5: + // XX note that in32 is doing double duty here + u3r_bytes(0, 4, (c3_y*)&(in32.v), n); + in32 = f32_div((float32_t){SB_REAL32_ONE}, in32); + sscal(len_x, in32, (float32_t*)x_bytes, 1); + break; + + case 6: + // XX note that in64 is doing double duty here + u3r_bytes(0, 8, (c3_y*)&(in64.v), n); + in64 = f64_div((float64_t){SB_REAL64_ONE}, in64); + dscal(len_x, in64, (float64_t*)x_bytes, 1); + break; + + case 7: + // XX note that in128 is doing double duty here + u3r_bytes(0, 16, (c3_y*)&(in128.v[0]), n); + f128M_div(&((float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}), &in128, &in128); + qscal(len_x, in128, (float128_t*)x_bytes, 1); + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), x_bytes); + + // Clean up and return. + u3a_free(x_bytes); + + return r_data; + } + +/* mods - x % [n] = x - r*floor(x/r) + remainder after scalar division +*/ + u3_noun + u3qf_la_mods_real(u3_noun x_data, + u3_noun n, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. 
+ // len_x is length in base units + c3_d len_x = _get_length(shape); + fprintf(stderr, "len_x: %ld\r\n", len_x); + + // siz_x is length in bytes + c3_d siz_x = len_x * pow(2, bloq-3); + fprintf(stderr, "siz_x: %ld\r\n", siz_x); + + // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + // we reuse it for results for parsimony + c3_y* x_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); + u3r_bytes(0, siz_x+1, x_bytes, x_data); + for (c3_d i = 0; i < siz_x+1; i++) { + fprintf(stderr, "x_bytes[%ld]: %x\r\n", i, x_bytes[i]); + } + + float16_t n16, in16; + float32_t n32, in32; + float64_t n64, in64; + float128_t n128, in128; + + // Switch on the block size. + switch (bloq) { + case 4: + u3r_bytes(0, 2, (c3_y*)&n16, n); + in16 = f16_div((float16_t){SB_REAL16_ONE}, n16); + + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + // Perform division x/n + float16_t div_result16 = f16_mul(in16, x_val16); + // Compute floor of the division result + int64_t floor_result16 = f16_to_i64(div_result16, softfloat_round_minMag, false); + float16_t floor_float16 = i64_to_f16(floor_result16); + // Multiply n by floor(x/n) + float16_t mult_result16 = f16_mul(n16, floor_float16); + // Compute remainder: x - n * floor(x/n) + ((float16_t*)x_bytes)[i] = f16_sub(x_val16, mult_result16); + } + break; + + case 5: + u3l_log("n: %x", n); u3r_bytes(0, 4, (c3_y*)&(n32.v), n); - n32 = f32_div((float32_t){SB_REAL32_ONE}, n32); - sscal(len_x, n32, (float32_t*)x_bytes, 1); + fprintf(stderr, "n32: %f\r\n", n32.v); + in32 = f32_div((float32_t){SB_REAL32_ONE}, n32); + fprintf(stderr, "in32: %f\r\n", in32); + + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + fprintf(stderr, "x_val32: %f\r\n", (float32_t)x_val32); + // Perform division x/n + float32_t div_result32 = f32_mul((float32_t)in32, (float32_t)x_val32); + fprintf(stderr, "div_result32: %f\r\n", div_result32); + // Compute floor of the division result + int64_t 
floor_result32 = f32_to_i64(div_result32, softfloat_round_minMag, false); + fprintf(stderr, "floor_result32: %ld\r\n", floor_result32); + float32_t floor_float32 = i64_to_f32(floor_result32); + fprintf(stderr, "floor_float32: %f\r\n", floor_float32); + // Multiply n by floor(x/n) + float32_t mult_result32 = f32_mul(n32, floor_float32); + fprintf(stderr, "mult_result32: %f\r\n", mult_result32); + // Compute remainder: x - n * floor(x/n) + ((float32_t*)x_bytes)[i] = f32_sub(x_val32, mult_result32); + fprintf(stderr, "x_bytes[i]: %f\r\n\r\n", ((float32_t*)x_bytes)[i]); + } break; case 6: - u3r_bytes(0, 8, (c3_y*)&(n64.v), n); - n64 = f64_div((float64_t){SB_REAL64_ONE}, n64); - dscal(len_x, n64, (float64_t*)x_bytes, 1); + u3r_bytes(0, 8, (c3_y*)&n64, n); + in64 = f64_div((float64_t){SB_REAL64_ONE}, n64); + + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + // Perform division x/n + float64_t div_result64 = f64_mul(in64, x_val64); + // Compute floor of the division result + int64_t floor_result64 = f64_to_i64(div_result64, softfloat_round_minMag, false); + float64_t floor_float64 = i64_to_f64(floor_result64); + // Multiply n by floor(x/n) + float64_t mult_result64 = f64_mul(n64, floor_float64); + // Compute remainder: x - n * floor(x/n) + ((float64_t*)x_bytes)[i] = f64_sub(x_val64, mult_result64); + } break; case 7: - u3r_bytes(0, 16, (c3_y*)&(n128.v[0]), n); - f128M_div(&((float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}), &n128, &n128); - qscal(len_x, n128, (float128_t*)x_bytes, 1); + u3r_bytes(0, 16, (c3_y*)&n128, n); + f128M_div(&((float128_t){SB_REAL128L_ONE,SB_REAL128U_ZERO}), &n128, &in128); + + for (c3_d i = 0; i < len_x; i++) { + float128_t x_val128 = ((float128_t*)x_bytes)[i]; + // Perform division x/n + float128_t div_result128; + f128M_mul((float128_t*)&in128, (float128_t*)&x_val128, (float128_t*)&div_result128); + // Compute floor of the division result + int64_t floor_result128 = f128_to_i64(div_result128, 
softfloat_round_minMag, false); + float128_t floor_float128 = i64_to_f128(floor_result128); + // Multiply n by floor(x/n) + float128_t mult_result128; + f128M_mul(((float128_t*)&n128), ((float128_t*)&floor_float128), ((float128_t*)&mult_result128)); + // Compute remainder: x - n * floor(x/n) + f128M_sub(((float128_t*)&x_val128), ((float128_t*)&mult_result128), &(((float128_t*)x_bytes)[i])); + } break; } @@ -1216,9 +1350,9 @@ return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: ; // XX satisfy label - // Global rounding mode is ignored by SoftFloat conversions so we pass it in. - u3_noun r_data = u3qf_la_mod_real(x_data, y_data, x_shape, x_bloq, rnd); + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_mod_real(x_data, y_data, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -1368,6 +1502,41 @@ } } + u3_noun + u3wf_la_mods(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, n; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_3, &n, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(n) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_fxp, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + switch (x_kind) { + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_mods_real(x_data, n, x_shape, x_bloq); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + + default: + return u3_none; + } + } + } + u3_noun u3wf_la_dot(u3_noun cor) { diff --git a/pkg/noun/jets/q.h b/pkg/noun/jets/q.h index 63d0bb724e..0b504b185e 100644 --- a/pkg/noun/jets/q.h +++ b/pkg/noun/jets/q.h @@ -251,11 +251,12 @@ u3_noun u3qf_la_sub_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_mul_real(u3_noun, u3_noun, u3_noun, 
u3_noun); u3_noun u3qf_la_div_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_mod_real(u3_noun, u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_mod_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_adds_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_subs_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_muls_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_divs_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_mods_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_dot_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_diag(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_trace_real(u3_noun, u3_noun, u3_noun); diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index d8375dc4fc..c6c148d83f 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2160,6 +2160,7 @@ static u3j_harm _139_hex__lagoon_adds_a[] = {{".2", u3wf_la_adds}, {}}; static u3j_harm _139_hex__lagoon_subs_a[] = {{".2", u3wf_la_subs}, {}}; static u3j_harm _139_hex__lagoon_muls_a[] = {{".2", u3wf_la_muls}, {}}; static u3j_harm _139_hex__lagoon_divs_a[] = {{".2", u3wf_la_divs}, {}}; +static u3j_harm _139_hex__lagoon_mods_a[] = {{".2", u3wf_la_mods}, {}}; static u3j_harm _139_hex__lagoon_dot_a[] = {{".2", u3wf_la_dot}, {}}; static u3j_harm _139_hex__lagoon_diag_a[] = {{".2", u3wf_la_diag}, {}}; static u3j_harm _139_hex__lagoon_trace_a[]= {{".2", u3wf_la_trace}, {}}; @@ -2174,6 +2175,7 @@ static u3j_core _139_hex__la_core_d[] = { "sub-scal", 7, _139_hex__lagoon_subs_a, 0, no_hashes }, { "mul-scal", 7, _139_hex__lagoon_muls_a, 0, no_hashes }, { "div-scal", 7, _139_hex__lagoon_divs_a, 0, no_hashes }, + { "mod-scal", 7, _139_hex__lagoon_mods_a, 0, no_hashes }, { "dot", 7, _139_hex__lagoon_dot_a, 0, no_hashes }, { "diag", 7, _139_hex__lagoon_diag_a, 0, no_hashes }, { "trace", 7, _139_hex__lagoon_trace_a,0, no_hashes }, diff --git a/pkg/noun/jets/w.h b/pkg/noun/jets/w.h index 0d8dadbf2f..f36e490e2c 100644 --- 
a/pkg/noun/jets/w.h +++ b/pkg/noun/jets/w.h @@ -340,6 +340,7 @@ u3_noun u3wf_la_subs(u3_noun); u3_noun u3wf_la_muls(u3_noun); u3_noun u3wf_la_divs(u3_noun); + u3_noun u3wf_la_mods(u3_noun); u3_noun u3wf_la_dot(u3_noun); u3_noun u3wf_la_diag(u3_noun); u3_noun u3wf_la_trace(u3_noun); From 110580373116fb645e4bf47ff8436171ff675b8d Mon Sep 17 00:00:00 2001 From: Sigilante Date: Mon, 15 Apr 2024 15:00:36 -0500 Subject: [PATCH 15/97] Add transpose and fixed other jets. --- pkg/noun/jets/f/lagoon.c | 208 +++++++++++++++++++++++++-------------- pkg/noun/jets/q.h | 1 + pkg/noun/jets/tree.c | 2 + pkg/noun/jets/w.h | 1 + 4 files changed, 140 insertions(+), 72 deletions(-) diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c index 8ae1e77e03..1318b532a2 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/f/lagoon.c @@ -491,7 +491,7 @@ // Switch on the block size. We assume that n fits in the target block size; Hoon typecheck should prevent. switch (bloq) { case 4: - u3r_bytes(0, 2, (c3_y*)&n16, n); + u3r_bytes(0, 2, (c3_y*)&(n16.v), n); // set y to [n] for (c3_d i = 0; i < len_x; i++) { ((float16_t*)y_bytes)[i] = n16; @@ -500,7 +500,7 @@ break; case 5: - u3r_bytes(0, 4, (c3_y*)&n32, n); + u3r_bytes(0, 4, (c3_y*)&(n32.v), n); // set y to [n] for (c3_d i = 0; i < len_x; i++) { ((float32_t*)y_bytes)[i] = n32; @@ -509,7 +509,7 @@ break; case 6: - u3r_bytes(0, 8, (c3_y*)&n64, n); + u3r_bytes(0, 8, (c3_y*)&(n64.v), n); // set y to [n] for (c3_d i = 0; i < len_x; i++) { ((float64_t*)y_bytes)[i] = n64; @@ -518,7 +518,7 @@ break; case 7: - u3r_bytes(0, 16, (c3_y*)&n128, n); + u3r_bytes(0, 16, (c3_y*)&(n128.v[0]), n); // set y to [n] for (c3_d i = 0; i < len_x; i++) { ((float128_t*)y_bytes)[i] = (float128_t){n128.v[0], n128.v[1]}; @@ -573,7 +573,7 @@ // Switch on the block size. We assume that n fits in the target block size; Hoon typecheck should prevent. 
switch (bloq) { case 4: - u3r_bytes(0, 2, (c3_y*)&n16, n); + u3r_bytes(0, 2, (c3_y*)&(n16.v), n); // set y to [n] for (c3_d i = 0; i < len_x; i++) { ((float16_t*)y_bytes)[i] = n16; @@ -582,7 +582,7 @@ break; case 5: - u3r_bytes(0, 4, (c3_y*)&n32, n); + u3r_bytes(0, 4, (c3_y*)&(n32.v), n); // set y to [n] for (c3_d i = 0; i < len_x; i++) { ((float32_t*)y_bytes)[i] = n32; @@ -591,7 +591,7 @@ break; case 6: - u3r_bytes(0, 8, (c3_y*)&n64, n); + u3r_bytes(0, 8, (c3_y*)&(n64.v), n); // set y to [n] for (c3_d i = 0; i < len_x; i++) { ((float64_t*)y_bytes)[i] = n64; @@ -600,7 +600,7 @@ break; case 7: - u3r_bytes(0, 16, (c3_y*)&n128, n); + u3r_bytes(0, 16, (c3_y*)&(n128.v[0]), n); // set y to [n] for (c3_d i = 0; i < len_x; i++) { ((float128_t*)y_bytes)[i] = (float128_t){n128.v[0], n128.v[1]}; @@ -654,17 +654,17 @@ // Switch on the block size. switch (bloq) { case 4: - u3r_bytes(0, 2, (c3_y*)&n16, n); + u3r_bytes(0, 2, (c3_y*)&(n16.v), n); hscal(len_x, n16, (float16_t*)x_bytes, 1); break; case 5: - u3r_bytes(0, 4, (c3_y*)&n32, n); + u3r_bytes(0, 4, (c3_y*)&(n32.v), n); sscal(len_x, n32, (float32_t*)x_bytes, 1); break; case 6: - u3r_bytes(0, 8, (c3_y*)&n64, n); + u3r_bytes(0, 8, (c3_y*)&(n64.v), n); dscal(len_x, n64, (float64_t*)x_bytes, 1); break; @@ -771,19 +771,14 @@ // Unpack the data as a byte array. We assume total length < 2**64. // len_x is length in base units c3_d len_x = _get_length(shape); - fprintf(stderr, "len_x: %ld\r\n", len_x); // siz_x is length in bytes c3_d siz_x = len_x * pow(2, bloq-3); - fprintf(stderr, "siz_x: %ld\r\n", siz_x); // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy) // we reuse it for results for parsimony c3_y* x_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); u3r_bytes(0, siz_x+1, x_bytes, x_data); - for (c3_d i = 0; i < siz_x+1; i++) { - fprintf(stderr, "x_bytes[%ld]: %x\r\n", i, x_bytes[i]); - } float16_t n16, in16; float32_t n32, in32; @@ -793,7 +788,7 @@ // Switch on the block size. 
switch (bloq) { case 4: - u3r_bytes(0, 2, (c3_y*)&n16, n); + u3r_bytes(0, 2, (c3_y*)&(n16.v), n); in16 = f16_div((float16_t){SB_REAL16_ONE}, n16); for (c3_d i = 0; i < len_x; i++) { @@ -811,34 +806,25 @@ break; case 5: - u3l_log("n: %x", n); u3r_bytes(0, 4, (c3_y*)&(n32.v), n); - fprintf(stderr, "n32: %f\r\n", n32.v); in32 = f32_div((float32_t){SB_REAL32_ONE}, n32); - fprintf(stderr, "in32: %f\r\n", in32); for (c3_d i = 0; i < len_x; i++) { float32_t x_val32 = ((float32_t*)x_bytes)[i]; - fprintf(stderr, "x_val32: %f\r\n", (float32_t)x_val32); // Perform division x/n float32_t div_result32 = f32_mul((float32_t)in32, (float32_t)x_val32); - fprintf(stderr, "div_result32: %f\r\n", div_result32); // Compute floor of the division result int64_t floor_result32 = f32_to_i64(div_result32, softfloat_round_minMag, false); - fprintf(stderr, "floor_result32: %ld\r\n", floor_result32); float32_t floor_float32 = i64_to_f32(floor_result32); - fprintf(stderr, "floor_float32: %f\r\n", floor_float32); // Multiply n by floor(x/n) float32_t mult_result32 = f32_mul(n32, floor_float32); - fprintf(stderr, "mult_result32: %f\r\n", mult_result32); // Compute remainder: x - n * floor(x/n) ((float32_t*)x_bytes)[i] = f32_sub(x_val32, mult_result32); - fprintf(stderr, "x_bytes[i]: %f\r\n\r\n", ((float32_t*)x_bytes)[i]); } break; case 6: - u3r_bytes(0, 8, (c3_y*)&n64, n); + u3r_bytes(0, 8, (c3_y*)&(n64.v), n); in64 = f64_div((float64_t){SB_REAL64_ONE}, n64); for (c3_d i = 0; i < len_x; i++) { @@ -856,7 +842,7 @@ break; case 7: - u3r_bytes(0, 16, (c3_y*)&n128, n); + u3r_bytes(0, 16, (c3_y*)&(n128.v[0]), n); f128M_div(&((float128_t){SB_REAL128L_ONE,SB_REAL128U_ZERO}), &n128, &in128); for (c3_d i = 0; i < len_x; i++) { @@ -913,28 +899,39 @@ c3_y* y_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); u3r_bytes(0, siz_x+1, y_bytes, y_data); + u3_noun r_data; + // Switch on the block size. 
switch (bloq) { - case 4: - hdot(len_x, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); + case 4: ; + float16_t r16[2]; + r16[0] = hdot(len_x, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); + r16[1].v = 0x1; + r_data = u3i_bytes((2+1)*sizeof(c3_y), (c3_y*)r16); break; - case 5: - sdot(len_x, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + case 5: ; + float32_t r32[2]; + r32[0] = sdot(len_x, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + r32[1].v = 0x1; + r_data = u3i_bytes((4+1)*sizeof(c3_y), (c3_y*)r32); break; - case 6: - ddot(len_x, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + case 6: ; + float64_t r64[2]; + r64[0] = ddot(len_x, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + r64[1].v = 0x1; + r_data = u3i_bytes((8+1)*sizeof(c3_y), (c3_y*)r64); break; - case 7: - qdot(len_x, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); + case 7: ; + float128_t r128[2]; + r128[0] = qdot(len_x, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); + r128[1] = (float128_t){0x1, 0x0}; + r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)r128); break; } - // r_data is the result noun of [data] - u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), y_bytes); - // Clean up and return. u3a_free(x_bytes); u3a_free(y_bytes); @@ -949,6 +946,10 @@ u3_noun shape, u3_noun bloq) { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } // Assert length of dims is 2. assert(u3qb_lent(shape) == 2); // Unpack shape into an array of dimensions. @@ -958,18 +959,19 @@ // Unpack the data as a byte array. We assume total length < 2**64. 
c3_d len_x = _get_length(shape); c3_d siz_x = len_x * pow(2, bloq - 3); - c3_d stride = dims[0] * pow(2, bloq - 3); + c3_d wyd = pow(2, bloq - 3); c3_y* x_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); u3r_bytes(0, siz_x+1, x_bytes, x_data); - c3_d siz_y = stride * dims[1]; + c3_d siz_y = wyd * dims[1]; c3_y* y_bytes = (c3_y*)u3a_malloc((siz_y+1)*sizeof(c3_y)); u3_noun r_data; + // Grab the index at i*n_x+j in bytes; put it at j. for (c3_d i = 0; i < dims[1]; i++) { - for (c3_d j = 0; j < stride; j++) { - fprintf(stderr, "i*s+j = %d*%d+%d = %d // x_bytes[i]: %lx\r\n", i, stride, j, i*stride+j, x_bytes[i*stride+j + i]); - y_bytes[i*stride+j] = x_bytes[i*stride+j + i]; + // Scan across whole field width. + for (c3_y k = 0; k < wyd; k++) { + y_bytes[i*wyd+k] = x_bytes[(i*dims[0]+i)*wyd+k]; } } y_bytes[siz_y] = 1; // pin head @@ -984,6 +986,49 @@ return r_data; } +/* transpose - x' +*/ + u3_noun + u3qf_la_transpose(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Assert length of dims is 2. + assert(u3qb_lent(shape) == 2); + // Unpack shape into an array of dimensions. + c3_d *dims = _get_dims(shape); + + // Unpack the data as a byte array. We assume total length < 2**64. + c3_d len_x = _get_length(shape); + c3_d siz_x = len_x * pow(2, bloq - 3); + c3_d wyd = pow(2, bloq - 3); + c3_y* x_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); + u3r_bytes(0, siz_x+1, x_bytes, x_data); + c3_y* y_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); + + u3_noun r_data; + + // Grab the index at i*n_x+j in bytes; put it at j. + for (c3_d i = 0; i < dims[1]; i++) { + for (c3_d j = 0; j < dims[0]; j++) { + // Scan across whole field width. + for (c3_y k = 0; k < wyd; k++) { + y_bytes[(j*dims[1]+i)*wyd+k] = x_bytes[(i*dims[0]+j)*wyd+k]; + } + } + } + y_bytes[siz_x] = 1; // pin head + + // Unpack the result back into a noun. 
+ r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), y_bytes); + + u3a_free(x_bytes); + u3a_free(y_bytes); + u3a_free(dims); + + return r_data; + } + /* trace - tr(x) */ u3_noun @@ -991,9 +1036,10 @@ u3_noun shape, u3_noun bloq) { - u3_noun diag_data = u3qf_la_diag(x_data, shape, bloq); + u3_noun d_data = u3qf_la_diag(x_data, shape, bloq); c3_d len_x0 = _get_dims(shape)[0]; - return u3qf_la_dot_real(diag_data, diag_data, u3nt(len_x0, 0x1, u3_nul), bloq); + u3_noun r_data = u3qf_la_dot_real(d_data, d_data, u3nt(len_x0, 0x1, u3_nul), bloq); + return r_data; } /* mmul @@ -1583,7 +1629,8 @@ case c3__real: _set_rounding(rnd); u3_noun r_data = u3qf_la_dot_real(x_data, y_data, x_shape, x_bloq); - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + c3_d len_x0 = _get_dims(x_shape)[0]; + return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: return u3_none; @@ -1606,26 +1653,51 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp, - rnd; - if ( c3n == u3r_mean(x_meta, - 2, &x_shape, - 6, &x_bloq, - 14, &x_kind, - 15, &x_fxp, - 0) - // c3n == u3r_sing(x_shape, y_shape) || - // c3n == u3r_sing(x_bloq, y_bloq) || - // c3n == u3r_sing(x_kind, y_kind) || - // c3n == u3r_sing(x_fxp, y_fxp) || - // c3n == u3r_mean(cor, u3x_con_sam, &rnd, 0) + u3_noun x_shape, x_bloq, x_kind, x_fxp; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) ) { return u3m_bail(c3__exit); } else { u3_noun r_data = u3qf_la_diag(x_data, x_shape, x_bloq); c3_d len_x0 = _get_dims(x_shape)[0]; - return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), x_bloq, x_kind, x_fxp), r_data); + return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + } + } + } + + u3_noun + u3wf_la_transpose(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + 
u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_fxp; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + u3_noun r_data = u3qf_la_transpose(x_data, x_shape, x_bloq); + return u3nc(u3nq(u3nt(u3k(u3h(x_shape)), u3k(u3h(u3t(x_shape))), u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); } } } @@ -1644,29 +1716,21 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp, - rnd; + u3_noun x_shape, x_bloq, x_kind, x_fxp; if ( c3n == u3r_mean(x_meta, 2, &x_shape, 6, &x_bloq, 14, &x_kind, 15, &x_fxp, 0) - // c3n == u3r_sing(x_shape, y_shape) || - // c3n == u3r_sing(x_bloq, y_bloq) || - // c3n == u3r_sing(x_kind, y_kind) || - // c3n == u3r_sing(x_fxp, y_fxp) || - // c3n == u3r_mean(cor, u3x_con_sam, &rnd, 0) ) { return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: - _set_rounding(rnd); + case c3__real: ; u3_noun r_data = u3qf_la_trace_real(x_data, x_shape, x_bloq); - return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), x_bloq, x_kind, x_fxp), r_data); - break; + return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: return u3_none; diff --git a/pkg/noun/jets/q.h b/pkg/noun/jets/q.h index 0b504b185e..37f4ccc127 100644 --- a/pkg/noun/jets/q.h +++ b/pkg/noun/jets/q.h @@ -259,6 +259,7 @@ u3_noun u3qf_la_mods_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_dot_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_diag(u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_transpose(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_trace_real(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_mmul_real(u3_noun, u3_noun, u3_noun, u3_noun, u3_noun); diff --git 
a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index c6c148d83f..db84df3780 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2162,6 +2162,7 @@ static u3j_harm _139_hex__lagoon_muls_a[] = {{".2", u3wf_la_muls}, {}}; static u3j_harm _139_hex__lagoon_divs_a[] = {{".2", u3wf_la_divs}, {}}; static u3j_harm _139_hex__lagoon_mods_a[] = {{".2", u3wf_la_mods}, {}}; static u3j_harm _139_hex__lagoon_dot_a[] = {{".2", u3wf_la_dot}, {}}; +static u3j_harm _139_hex__lagoon_trans_a[] ={{".2", u3wf_la_transpose}, {}}; static u3j_harm _139_hex__lagoon_diag_a[] = {{".2", u3wf_la_diag}, {}}; static u3j_harm _139_hex__lagoon_trace_a[]= {{".2", u3wf_la_trace}, {}}; static u3j_harm _139_hex__lagoon_mmul_a[] = {{".2", u3wf_la_mmul}, {}}; @@ -2177,6 +2178,7 @@ static u3j_core _139_hex__la_core_d[] = { "div-scal", 7, _139_hex__lagoon_divs_a, 0, no_hashes }, { "mod-scal", 7, _139_hex__lagoon_mods_a, 0, no_hashes }, { "dot", 7, _139_hex__lagoon_dot_a, 0, no_hashes }, + { "transpose",7, _139_hex__lagoon_trans_a, 0, no_hashes }, { "diag", 7, _139_hex__lagoon_diag_a, 0, no_hashes }, { "trace", 7, _139_hex__lagoon_trace_a,0, no_hashes }, { "mmul", 7, _139_hex__lagoon_mmul_a, 0, no_hashes }, diff --git a/pkg/noun/jets/w.h b/pkg/noun/jets/w.h index f36e490e2c..36d66c21ba 100644 --- a/pkg/noun/jets/w.h +++ b/pkg/noun/jets/w.h @@ -343,6 +343,7 @@ u3_noun u3wf_la_mods(u3_noun); u3_noun u3wf_la_dot(u3_noun); u3_noun u3wf_la_diag(u3_noun); + u3_noun u3wf_la_transpose(u3_noun); u3_noun u3wf_la_trace(u3_noun); u3_noun u3wf_la_mmul(u3_noun); From 7eab0d7830494e5171200b0c31558da49a54f00e Mon Sep 17 00:00:00 2001 From: Sigilante Date: Tue, 16 Apr 2024 15:52:41 -0500 Subject: [PATCH 16/97] Add +abs etc. 
--- pkg/noun/jets/f/lagoon.c | 972 ++++++++++++++++++++++++++++++++++----- pkg/noun/jets/q.h | 9 + pkg/noun/jets/tree.c | 30 ++ pkg/noun/jets/w.h | 9 + 4 files changed, 899 insertions(+), 121 deletions(-) diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c index 1318b532a2..7ffc7fd7ad 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/f/lagoon.c @@ -117,16 +117,16 @@ // len_x is length in base units c3_d len_x = _get_length(shape); - // siz_x is length in bytes - c3_d siz_x = len_x * pow(2, bloq-3); + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); // x_bytes is the data array (w/o leading 0x1) - c3_y* x_bytes = (c3_y*)u3a_malloc(siz_x*sizeof(c3_y)); - u3r_bytes(0, siz_x, x_bytes, x_data); + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) - c3_y* y_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); - u3r_bytes(0, siz_x+1, y_bytes, y_data); + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); // Switch on the block size. switch (bloq) { @@ -148,7 +148,7 @@ } // r_data is the result noun of [data] - u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), y_bytes); + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); // Clean up and return. 
u3a_free(x_bytes); @@ -175,16 +175,16 @@ // len_x is length in base units c3_d len_x = _get_length(shape); - // siz_x is length in bytes - c3_d siz_x = len_x * pow(2, bloq-3); + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); // x_bytes is the data array (w/o leading 0x1) - c3_y* x_bytes = (c3_y*)u3a_malloc(siz_x*sizeof(c3_y)); - u3r_bytes(0, siz_x, x_bytes, x_data); + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) - c3_y* y_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); - u3r_bytes(0, siz_x+1, y_bytes, y_data); + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); // Switch on the block size. switch (bloq) { @@ -206,7 +206,7 @@ } // r_data is the result noun of [data] - u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), y_bytes); + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); // Clean up and return. u3a_free(x_bytes); @@ -234,16 +234,16 @@ // len_x is length in base units c3_d len_x = _get_length(shape); - // siz_x is length in bytes - c3_d siz_x = len_x * pow(2, bloq-3); + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); // x_bytes is the data array (w/o leading 0x1) - c3_y* x_bytes = (c3_y*)u3a_malloc(siz_x*sizeof(c3_y)); - u3r_bytes(0, siz_x, x_bytes, x_data); + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) - c3_y* y_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); - u3r_bytes(0, siz_x+1, y_bytes, y_data); + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); // Switch on the block size. 
switch (bloq) { @@ -273,7 +273,7 @@ } // r_data is the result noun of [data] - u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), y_bytes); + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); // Clean up and return. u3a_free(x_bytes); @@ -300,16 +300,16 @@ // len_x is length in base units c3_d len_x = _get_length(shape); - // siz_x is length in bytes - c3_d siz_x = len_x * pow(2, bloq-3); + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); // x_bytes is the data array (w/o leading 0x1) - c3_y* x_bytes = (c3_y*)u3a_malloc(siz_x*sizeof(c3_y)); - u3r_bytes(0, siz_x, x_bytes, x_data); + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) - c3_y* y_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); - u3r_bytes(0, siz_x+1, y_bytes, y_data); + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); // Switch on the block size. switch (bloq) { @@ -339,7 +339,7 @@ } // r_data is the result noun of [data] - u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), y_bytes); + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); // Clean up and return. 
u3a_free(x_bytes); @@ -366,16 +366,16 @@ // len_x is length in base units c3_d len_x = _get_length(shape); - // siz_x is length in bytes - c3_d siz_x = len_x * pow(2, bloq-3); + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); // x_bytes is the data array (w/o leading 0x1) - c3_y* x_bytes = (c3_y*)u3a_malloc(siz_x*sizeof(c3_y)); - u3r_bytes(0, siz_x, x_bytes, x_data); + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) - c3_y* y_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); - u3r_bytes(0, siz_x+1, y_bytes, y_data); + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); // Switch on the block size. switch (bloq) { @@ -447,7 +447,358 @@ } // r_data is the result noun of [data] - u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), y_bytes); + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* abs - |x| +*/ + u3_noun + u3qf_la_abs_real(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/ leading 0x1, skipped by for range) + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); + + // Switch on the block size. 
+ switch (bloq) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + ((float16_t*)x_bytes)[i] = f16_abs(((float16_t*)x_bytes)[i]); + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + ((float32_t*)x_bytes)[i] = f32_abs(((float32_t*)x_bytes)[i]); + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + ((float64_t*)x_bytes)[i] = f64_abs(((float64_t*)x_bytes)[i]); + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + ((float128_t*)x_bytes)[i] = f128_abs(((float128_t*)x_bytes)[i]); + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), x_bytes); + + // Clean up and return. + u3a_free(x_bytes); + + return r_data; + } + +/* gth - x > y, elementwise (SB_REAL*_ONE where true, SB_REAL*_ZERO otherwise) +*/ + u3_noun + u3qf_la_gth_real(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1); overwritten in place with the result + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. + switch (bloq) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + float16_t y_val16 = ((float16_t*)y_bytes)[i]; + ((float16_t*)y_bytes)[i] = f16_gt(x_val16, y_val16) ? (float16_t){SB_REAL16_ONE} : (float16_t){SB_REAL16_ZERO}; + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + float32_t y_val32 = ((float32_t*)y_bytes)[i]; + ((float32_t*)y_bytes)[i] = f32_gt(x_val32, y_val32) ?
(float32_t){SB_REAL32_ONE} : (float32_t){SB_REAL32_ZERO}; + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + float64_t y_val64 = ((float64_t*)y_bytes)[i]; + ((float64_t*)y_bytes)[i] = f64_gt(x_val64, y_val64) ? (float64_t){SB_REAL64_ONE} : (float64_t){SB_REAL64_ZERO}; + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + float128_t x_val128 = ((float128_t*)x_bytes)[i]; + float128_t y_val128 = ((float128_t*)y_bytes)[i]; + ((float128_t*)y_bytes)[i] = f128M_gt(((float128_t*)&x_val128), ((float128_t*)&y_val128)) ? (float128_t){SB_REAL128L_ONE, SB_REAL128U_ONE} : (float128_t){SB_REAL128L_ZERO, SB_REAL128U_ZERO}; + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* gte - x >= y, elementwise (SB_REAL*_ONE where true, SB_REAL*_ZERO otherwise) +*/ + u3_noun + u3qf_la_gte_real(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1); overwritten in place with the result + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. + switch (bloq) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + float16_t y_val16 = ((float16_t*)y_bytes)[i]; + ((float16_t*)y_bytes)[i] = f16_ge(x_val16, y_val16) ?
(float16_t){SB_REAL16_ONE} : (float16_t){SB_REAL16_ZERO}; + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + float32_t y_val32 = ((float32_t*)y_bytes)[i]; + ((float32_t*)y_bytes)[i] = f32_ge(x_val32, y_val32) ? (float32_t){SB_REAL32_ONE} : (float32_t){SB_REAL32_ZERO}; + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + float64_t y_val64 = ((float64_t*)y_bytes)[i]; + ((float64_t*)y_bytes)[i] = f64_ge(x_val64, y_val64) ? (float64_t){SB_REAL64_ONE} : (float64_t){SB_REAL64_ZERO}; + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + float128_t x_val128 = ((float128_t*)x_bytes)[i]; + float128_t y_val128 = ((float128_t*)y_bytes)[i]; + ((float128_t*)y_bytes)[i] = f128M_ge(((float128_t*)&x_val128), ((float128_t*)&y_val128)) ? (float128_t){SB_REAL128L_ONE, SB_REAL128U_ONE} : (float128_t){SB_REAL128L_ZERO, SB_REAL128U_ZERO}; + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* lth - x < y, elementwise (SB_REAL*_ONE where true, SB_REAL*_ZERO otherwise) +*/ + u3_noun + u3qf_la_lth_real(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1); overwritten in place with the result + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size.
+ switch (bloq) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + float16_t y_val16 = ((float16_t*)y_bytes)[i]; + ((float16_t*)y_bytes)[i] = f16_lt(x_val16, y_val16) ? (float16_t){SB_REAL16_ONE} : (float16_t){SB_REAL16_ZERO}; + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + float32_t y_val32 = ((float32_t*)y_bytes)[i]; + ((float32_t*)y_bytes)[i] = f32_lt(x_val32, y_val32) ? (float32_t){SB_REAL32_ONE} : (float32_t){SB_REAL32_ZERO}; + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + float64_t y_val64 = ((float64_t*)y_bytes)[i]; + ((float64_t*)y_bytes)[i] = f64_lt(x_val64, y_val64) ? (float64_t){SB_REAL64_ONE} : (float64_t){SB_REAL64_ZERO}; + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + float128_t x_val128 = ((float128_t*)x_bytes)[i]; + float128_t y_val128 = ((float128_t*)y_bytes)[i]; + ((float128_t*)y_bytes)[i] = f128M_lt(((float128_t*)&x_val128), ((float128_t*)&y_val128)) ? (float128_t){SB_REAL128L_ONE, SB_REAL128U_ONE} : (float128_t){SB_REAL128L_ZERO, SB_REAL128U_ZERO}; + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* lte - x <= y, elementwise (SB_REAL*_ONE where true, SB_REAL*_ZERO otherwise) +*/ + u3_noun + u3qf_la_lte_real(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64.
+ // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1); overwritten in place with the result + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. + switch (bloq) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + float16_t y_val16 = ((float16_t*)y_bytes)[i]; + ((float16_t*)y_bytes)[i] = f16_le(x_val16, y_val16) ? (float16_t){SB_REAL16_ONE} : (float16_t){SB_REAL16_ZERO}; + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + float32_t y_val32 = ((float32_t*)y_bytes)[i]; + ((float32_t*)y_bytes)[i] = f32_le(x_val32, y_val32) ? (float32_t){SB_REAL32_ONE} : (float32_t){SB_REAL32_ZERO}; + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + float64_t y_val64 = ((float64_t*)y_bytes)[i]; + ((float64_t*)y_bytes)[i] = f64_le(x_val64, y_val64) ? (float64_t){SB_REAL64_ONE} : (float64_t){SB_REAL64_ZERO}; + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + float128_t x_val128 = ((float128_t*)x_bytes)[i]; + float128_t y_val128 = ((float128_t*)y_bytes)[i]; + ((float128_t*)y_bytes)[i] = f128M_le(((float128_t*)&x_val128), ((float128_t*)&y_val128)) ? (float128_t){SB_REAL128L_ONE, SB_REAL128U_ONE} : (float128_t){SB_REAL128L_ZERO, SB_REAL128U_ZERO}; + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); // Clean up and return.
u3a_free(x_bytes); @@ -473,15 +824,15 @@ // len_x is length in base units c3_d len_x = _get_length(shape); - // siz_x is length in bytes - c3_d siz_x = len_x * pow(2, bloq-3); + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); // x_bytes is the data array (w/o leading 0x1) - c3_y* x_bytes = (c3_y*)u3a_malloc(siz_x*sizeof(c3_y)); - u3r_bytes(0, siz_x, x_bytes, x_data); + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) - c3_y* y_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); float16_t n16; float32_t n32; @@ -528,8 +879,8 @@ } // r_data is the result noun of [data] - y_bytes[siz_x] = 1; // pin head - u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), y_bytes); + y_bytes[syz_x] = 1; // pin head + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); // Clean up and return. u3a_free(x_bytes); @@ -555,15 +906,15 @@ // len_x is length in base units c3_d len_x = _get_length(shape); - // siz_x is length in bytes - c3_d siz_x = len_x * pow(2, bloq-3); + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy) - c3_y* x_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); - u3r_bytes(0, siz_x, x_bytes, x_data); + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); // y_bytes is the data array (w/o leading 0x1) - c3_y* y_bytes = (c3_y*)u3a_malloc(siz_x*sizeof(c3_y)); + c3_y* y_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); float16_t n16; float32_t n32; @@ -610,8 +961,8 @@ } // r_data is the result noun of [data] - x_bytes[siz_x] = 1; // pin head - u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), x_bytes); + x_bytes[syz_x] = 1; // pin head + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), x_bytes); // Clean up and return. 
u3a_free(x_bytes); @@ -638,13 +989,13 @@ // len_x is length in base units c3_d len_x = _get_length(shape); - // siz_x is length in bytes - c3_d siz_x = len_x * pow(2, bloq-3); + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy) - c3_y* x_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); - u3r_bytes(0, siz_x, x_bytes, x_data); - x_bytes[siz_x] = 1; // pin head + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + x_bytes[syz_x] = 1; // pin head float16_t n16; float32_t n32; @@ -675,7 +1026,7 @@ } // r_data is the result noun of [data] - u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), x_bytes); + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), x_bytes); // Clean up and return. u3a_free(x_bytes); @@ -701,13 +1052,13 @@ // len_x is length in base units c3_d len_x = _get_length(shape); - // siz_x is length in bytes - c3_d siz_x = len_x * pow(2, bloq-3); + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy) - c3_y* x_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); - u3r_bytes(0, siz_x, x_bytes, x_data); - x_bytes[siz_x] = 1; // pin head + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + x_bytes[syz_x] = 1; // pin head float16_t in16; float32_t in32; @@ -746,7 +1097,7 @@ } // r_data is the result noun of [data] - u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), x_bytes); + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), x_bytes); // Clean up and return. 
u3a_free(x_bytes); @@ -772,13 +1123,13 @@ // len_x is length in base units c3_d len_x = _get_length(shape); - // siz_x is length in bytes - c3_d siz_x = len_x * pow(2, bloq-3); + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy) // we reuse it for results for parsimony - c3_y* x_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); - u3r_bytes(0, siz_x+1, x_bytes, x_data); + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); float16_t n16, in16; float32_t n32, in32; @@ -863,7 +1214,7 @@ } // r_data is the result noun of [data] - u3_noun r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), x_bytes); + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), x_bytes); // Clean up and return. u3a_free(x_bytes); @@ -888,16 +1239,16 @@ // len_x is length in base units c3_d len_x = _get_length(shape); - // siz_x is length in bytes - c3_d siz_x = len_x * pow(2, bloq-3); + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); // x_bytes is the data array (w/o leading 0x1) - c3_y* x_bytes = (c3_y*)u3a_malloc(siz_x*sizeof(c3_y)); - u3r_bytes(0, siz_x, x_bytes, x_data); + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) - c3_y* y_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); - u3r_bytes(0, siz_x+1, y_bytes, y_data); + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); u3_noun r_data; @@ -958,12 +1309,12 @@ // Unpack the data as a byte array. We assume total length < 2**64. 
c3_d len_x = _get_length(shape); - c3_d siz_x = len_x * pow(2, bloq - 3); + c3_d syz_x = len_x * pow(2, bloq - 3); c3_d wyd = pow(2, bloq - 3); - c3_y* x_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); - u3r_bytes(0, siz_x+1, x_bytes, x_data); - c3_d siz_y = wyd * dims[1]; - c3_y* y_bytes = (c3_y*)u3a_malloc((siz_y+1)*sizeof(c3_y)); + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); + c3_d syz_y = wyd * dims[1]; + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_y+1)*sizeof(c3_y)); u3_noun r_data; @@ -974,10 +1325,10 @@ y_bytes[i*wyd+k] = x_bytes[(i*dims[0]+i)*wyd+k]; } } - y_bytes[siz_y] = 1; // pin head + y_bytes[syz_y] = 1; // pin head // Unpack the result back into a noun. - r_data = u3i_bytes((siz_y+1)*sizeof(c3_y), y_bytes); + r_data = u3i_bytes((syz_y+1)*sizeof(c3_y), y_bytes); u3a_free(x_bytes); u3a_free(y_bytes); @@ -1000,11 +1351,11 @@ // Unpack the data as a byte array. We assume total length < 2**64. c3_d len_x = _get_length(shape); - c3_d siz_x = len_x * pow(2, bloq - 3); + c3_d syz_x = len_x * pow(2, bloq - 3); c3_d wyd = pow(2, bloq - 3); - c3_y* x_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); - u3r_bytes(0, siz_x+1, x_bytes, x_data); - c3_y* y_bytes = (c3_y*)u3a_malloc((siz_x+1)*sizeof(c3_y)); + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); u3_noun r_data; @@ -1017,10 +1368,10 @@ } } } - y_bytes[siz_x] = 1; // pin head + y_bytes[syz_x] = 1; // pin head // Unpack the result back into a noun. - r_data = u3i_bytes((siz_x+1)*sizeof(c3_y), y_bytes); + r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); u3a_free(x_bytes); u3a_free(y_bytes); @@ -1029,6 +1380,97 @@ return r_data; } +/* linspace - [a a+(b-a)/n ... b] +*/ + u3_noun + u3qf_la_linspace_real(u3_noun a, + u3_noun b, + u3_noun n, + u3_noun bloq) + { + // Fence on valid bloq size. 
+ if (bloq < 4 || bloq > 7) { + return u3_none; + } + + u3_noun r_data; + + switch (bloq) { + case 4: ; + float16_t a16, b16; + u3r_bytes(0, 2, (c3_y*)&(a16.v), a); + u3r_bytes(0, 2, (c3_y*)&(b16.v), b); + float16_t span16 = f16_sub(b16, a16); + float16_t interval16 = f16_div(span16, i32_to_f16(n)); + c3_y* x_bytes16 = (c3_y*)u3a_malloc(((n+1)*2+1)*sizeof(c3_y)); + for (c3_d i = 1; i <= n; i++) { /* NOTE(review): bound is i <= n here and in case 5 but i < n in case 6; a16 itself is never stored - confirm intended endpoints */ + ((float16_t*)x_bytes16)[n-i] = f16_add(a16, f16_mul(i32_to_f16(i), interval16)); + } + ((float16_t*)x_bytes16)[n] = b16; + x_bytes16[(n+1)*2] = 1; // pin head + r_data = u3i_bytes(((n+1)*2+1)*sizeof(c3_y), x_bytes16); + u3a_free(x_bytes16); + break; + + case 5: ; + float32_t a32, b32; + u3r_bytes(0, 4, (c3_y*)&(a32.v), a); + u3r_bytes(0, 4, (c3_y*)&(b32.v), b); + float32_t span32 = f32_sub(b32, a32); + float32_t interval32 = f32_div(span32, i32_to_f32(n)); + c3_y* x_bytes32 = (c3_y*)u3a_malloc(((n+1)*4+1)*sizeof(c3_y)); + for (c3_d i = 1; i <= n; i++) { + ((float32_t*)x_bytes32)[n-i] = f32_add(a32, f32_mul(i32_to_f32(i), interval32)); + } + ((float32_t*)x_bytes32)[n] = b32; + x_bytes32[(n+1)*4] = 1; // pin head + r_data = u3i_bytes(((n+1)*4+1)*sizeof(c3_y), x_bytes32); + u3a_free(x_bytes32); + break; + + case 6: ; + float64_t a64, b64; + u3r_bytes(0, 8, (c3_y*)&(a64.v), a); + u3r_bytes(0, 8, (c3_y*)&(b64.v), b); + float64_t span64 = f64_sub(b64, a64); + float64_t interval64 = f64_div(span64, i32_to_f64(n)); + c3_y* x_bytes64 = (c3_y*)u3a_malloc(((n+1)*8+1)*sizeof(c3_y)); + for (c3_d i = 1; i < n; i++) { /* NOTE(review): with i < n and n >= 1, element 0 is never written before u3i_bytes reads the buffer - confirm */ + ((float64_t*)x_bytes64)[n-i] = f64_add(a64, f64_mul(i32_to_f64(i), interval64)); + } + ((float64_t*)x_bytes64)[n] = b64; + x_bytes64[(n+1)*8] = 1; // pin head + r_data = u3i_bytes(((n+1)*8+1)*sizeof(c3_y), x_bytes64); + u3a_free(x_bytes64); + break; + + case 7: ; + float128_t a128, b128; + u3r_bytes(0, 16, (c3_y*)&(a128.v[0]), a); + u3r_bytes(0, 16, (c3_y*)&(b128.v[0]), b); + float128_t span128; + f128M_sub(&b128, &a128, &span128); + float128_t interval128; + float128_t
n128; + i32_to_f128M(n, &n128); + f128M_div(&span128, &n128, &interval128); + c3_y* x_bytes128 = (c3_y*)u3a_malloc(((n+1)*16+1)*sizeof(c3_y)); + float128_t i128; + for (c3_d i = 1; i < n; i++) { /* NOTE(review): with i < n and n >= 1, element 0 is never written before u3i_bytes reads the buffer - confirm */ + i32_to_f128M(i, &i128); + f128M_mul(&i128, &interval128, &((float128_t*)x_bytes128)[n-i]); + f128M_add(&a128, &((float128_t*)x_bytes128)[n-i], &((float128_t*)x_bytes128)[n-i]); + } + ((float128_t*)x_bytes128)[n] = b128; + x_bytes128[(n+1)*16] = 1; // pin head + r_data = u3i_bytes(((n+1)*16+1)*sizeof(c3_y), x_bytes128); + u3a_free(x_bytes128); + break; + } + + return r_data; + } + /* trace - tr(x) */ u3_noun @@ -1124,19 +1566,129 @@ return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); - default: - u3a_free(x_bytes); - u3a_free(y_bytes); - u3a_free(c_bytes); - - return u3_none; + default: + u3a_free(x_bytes); + u3a_free(y_bytes); + u3a_free(c_bytes); + + return u3_none; + } + } + + u3_noun + u3wf_la_add(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_fxp, + y_shape, y_bloq, y_kind, y_fxp, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + y_shape = u3h(y_meta); // 2 + y_bloq = u3h(u3t(y_meta)); // 6 + y_kind = u3h(u3t(u3t(y_meta))); // 14 + y_fxp = u3t(u3t(u3t(y_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(y_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(y_kind) || + c3n == u3r_sing(x_shape, y_shape) || + c3n == u3r_sing(x_bloq, y_bloq) || + c3n == u3r_sing(x_kind, y_kind) + // fxp does not need to match here so no check + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case
c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_add_real(x_data, y_data, x_shape, x_bloq); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wf_la_sub(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_fxp, + y_shape, y_bloq, y_kind, y_fxp, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + y_shape = u3h(y_meta); // 2 + y_bloq = u3h(u3t(y_meta)); // 6 + y_kind = u3h(u3t(u3t(y_meta))); // 14 + y_fxp = u3t(u3t(u3t(y_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(y_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(y_kind) || + c3n == u3r_sing(x_shape, y_shape) || + c3n == u3r_sing(x_bloq, y_bloq) || + c3n == u3r_sing(x_kind, y_kind) + // fxp does not need to match here so no check + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_sub_real(x_data, y_data, x_shape, x_bloq); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + + default: + return u3_none; + } + } } } u3_noun - u3wf_la_add(u3_noun cor) + u3wf_la_mul(u3_noun cor) { - // Each argument is a ray, [=meta data=@ux] + // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, y_meta, y_data; @@ -1178,7 +1730,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_add_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = u3qf_la_mul_real(x_data, y_data, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), 
u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -1189,7 +1741,7 @@ } u3_noun - u3wf_la_sub(u3_noun cor) + u3wf_la_div(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, @@ -1233,7 +1785,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_sub_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = u3qf_la_div_real(x_data, y_data, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -1244,7 +1796,7 @@ } u3_noun - u3wf_la_mul(u3_noun cor) + u3wf_la_mod(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, @@ -1288,7 +1840,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_mul_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = u3qf_la_mod_real(x_data, y_data, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -1299,9 +1851,47 @@ } u3_noun - u3wf_la_div(u3_noun cor) + u3wf_la_abs(u3_noun cor) { - // Each argument is a ray, [=meta data=@ux] + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_fxp; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__real: ; + u3_noun r_data = u3qf_la_abs_real(x_data, x_shape, x_bloq); + fprintf(stderr, "abs: %lx\n", x_bloq); /* NOTE(review): looks like a leftover debug trace written to stderr on every call - confirm and remove; also verify %lx matches the width of u3_noun */ + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wf_la_gth(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data,
y_meta, y_data; @@ -1327,23 +1917,21 @@ y_bloq = u3h(u3t(y_meta)); // 6 y_kind = u3h(u3t(u3t(y_meta))); // 14 y_fxp = u3t(u3t(u3t(y_meta))); // 15 - rnd = u3h(u3t(u3t(u3t(cor)))); // 30 if ( c3n == u3ud(x_bloq) || c3n == u3ud(y_bloq) || c3n == u3ud(x_kind) || c3n == u3ud(y_kind) || c3n == u3r_sing(x_shape, y_shape) || c3n == u3r_sing(x_bloq, y_bloq) || - c3n == u3r_sing(x_kind, y_kind) - // fxp does not need to match here so no check + c3n == u3r_sing(x_kind, y_kind) || + c3n == u3r_sing(x_fxp, y_fxp) ) { return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: - _set_rounding(rnd); - u3_noun r_data = u3qf_la_div_real(x_data, y_data, x_shape, x_bloq); + case c3__real: ; + u3_noun r_data = u3qf_la_gth_real(x_data, y_data, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -1354,9 +1942,9 @@ } u3_noun - u3wf_la_mod(u3_noun cor) + u3wf_la_gte(u3_noun cor) { - // Each argument is a ray, [=meta data=@ux] + // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, y_meta, y_data; @@ -1382,23 +1970,127 @@ y_bloq = u3h(u3t(y_meta)); // 6 y_kind = u3h(u3t(u3t(y_meta))); // 14 y_fxp = u3t(u3t(u3t(y_meta))); // 15 - rnd = u3h(u3t(u3t(u3t(cor)))); // 30 if ( c3n == u3ud(x_bloq) || c3n == u3ud(y_bloq) || c3n == u3ud(x_kind) || c3n == u3ud(y_kind) || c3n == u3r_sing(x_shape, y_shape) || c3n == u3r_sing(x_bloq, y_bloq) || - c3n == u3r_sing(x_kind, y_kind) - // fxp does not need to match here so no check + c3n == u3r_sing(x_kind, y_kind) || + c3n == u3r_sing(x_fxp, y_fxp) ) { return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: - _set_rounding(rnd); - u3_noun r_data = u3qf_la_mod_real(x_data, y_data, x_shape, x_bloq); + case c3__real: ; + u3_noun r_data = u3qf_la_gte_real(x_data, y_data, x_shape, x_bloq); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wf_la_lth(u3_noun cor) + { + // Each 
argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_fxp, + y_shape, y_bloq, y_kind, y_fxp, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + y_shape = u3h(y_meta); // 2 + y_bloq = u3h(u3t(y_meta)); // 6 + y_kind = u3h(u3t(u3t(y_meta))); // 14 + y_fxp = u3t(u3t(u3t(y_meta))); // 15 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(y_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(y_kind) || + c3n == u3r_sing(x_shape, y_shape) || + c3n == u3r_sing(x_bloq, y_bloq) || + c3n == u3r_sing(x_kind, y_kind) || + c3n == u3r_sing(x_fxp, y_fxp) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__real: ; + u3_noun r_data = u3qf_la_lth_real(x_data, y_data, x_shape, x_bloq); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wf_la_lte(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_fxp, + y_shape, y_bloq, y_kind, y_fxp, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + y_shape = u3h(y_meta); // 2 + y_bloq = u3h(u3t(y_meta)); // 6 + y_kind = u3h(u3t(u3t(y_meta))); // 14 + y_fxp = u3t(u3t(u3t(y_meta))); // 15 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(y_bloq) || + c3n == 
u3ud(x_kind) || + c3n == u3ud(y_kind) || + c3n == u3r_sing(x_shape, y_shape) || + c3n == u3r_sing(x_bloq, y_bloq) || + c3n == u3r_sing(x_kind, y_kind) || + c3n == u3r_sing(x_fxp, y_fxp) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__real: ; + u3_noun r_data = u3qf_la_lte_real(x_data, y_data, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -1640,7 +2332,7 @@ } u3_noun - u3wf_la_diag(u3_noun cor) + u3wf_la_transpose(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data; @@ -1664,15 +2356,52 @@ { return u3m_bail(c3__exit); } else { - u3_noun r_data = u3qf_la_diag(x_data, x_shape, x_bloq); - c3_d len_x0 = _get_dims(x_shape)[0]; - return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + u3_noun r_data = u3qf_la_transpose(x_data, x_shape, x_bloq); + return u3nc(u3nq(u3nt(u3k(u3h(x_shape)), u3k(u3h(u3t(x_shape))), u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); /* NOTE(review): result shape lists head-of-shape then second entry, i.e. the input order - confirm the dims should not be swapped for a transpose */ } } } u3_noun - u3wf_la_transpose(u3_noun cor) + u3wf_la_linspace(u3_noun cor) + { + u3_noun x_meta, a, b, n; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_12, &a, + u3x_sam_13, &b, + u3x_sam_7, &n, + 0)) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_fxp; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__real: ; + u3_noun r_data = u3qf_la_linspace_real(a, b, n, x_bloq); + x_shape = u3nc(u3x_atom(n)+1, u3_nul); /* NOTE(review): x_shape is now a freshly built cell and is still passed through u3k() on the next line - presumably leaves one extra reference; confirm against u3 refcount rules */ + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wf_la_diag(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data; @@ -1696,8 +2425,9 @@ { return
u3m_bail(c3__exit); } else { - u3_noun r_data = u3qf_la_transpose(x_data, x_shape, x_bloq); - return u3nc(u3nq(u3nt(u3k(u3h(x_shape)), u3k(u3h(u3t(x_shape))), u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + u3_noun r_data = u3qf_la_diag(x_data, x_shape, x_bloq); + c3_d len_x0 = _get_dims(x_shape)[0]; + return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); } } } diff --git a/pkg/noun/jets/q.h b/pkg/noun/jets/q.h index 37f4ccc127..de853bfe94 100644 --- a/pkg/noun/jets/q.h +++ b/pkg/noun/jets/q.h @@ -260,6 +260,15 @@ u3_noun u3qf_la_dot_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_diag(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_transpose(u3_noun, u3_noun, u3_noun); + + u3_noun u3qf_la_linspace_real(u3_noun, u3_noun, u3_noun, u3_noun); + + u3_noun u3qf_la_abs_real(u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_gth_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_gte_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_lth_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_lte_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_trace_real(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_mmul_real(u3_noun, u3_noun, u3_noun, u3_noun, u3_noun); diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index db84df3780..6252c2fa63 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2163,6 +2163,21 @@ static u3j_harm _139_hex__lagoon_divs_a[] = {{".2", u3wf_la_divs}, {}}; static u3j_harm _139_hex__lagoon_mods_a[] = {{".2", u3wf_la_mods}, {}}; static u3j_harm _139_hex__lagoon_dot_a[] = {{".2", u3wf_la_dot}, {}}; static u3j_harm _139_hex__lagoon_trans_a[] ={{".2", u3wf_la_transpose}, {}}; +// static u3j_harm _139_hex__lagoon_stack_a[] ={{".2", u3wf_la_stack}, {}}; +// static u3j_harm _139_hex__lagoon_cumsum_a[]={{".2", u3wf_la_cumsum}, {}}; +// static u3j_harm _139_hex__lagoon_argmin_a[]={{".2", u3wf_la_argmin}, {}}; +// static u3j_harm _139_hex__lagoon_argmax_a[]={{".2", 
u3wf_la_argmax}, {}}; +// static u3j_harm _139_hex__lagoon_ravel_a[]={{".2", u3wf_la_ravel}, {}}; +// static u3j_harm _139_hex__lagoon_min_a[]={{".2", u3wf_la_min}, {}}; +// static u3j_harm _139_hex__lagoon_max_a[]={{".2", u3wf_la_max}, {}}; +static u3j_harm _139_hex__lagoon_linspace_a[]={{".2", u3wf_la_linspace}, {}}; +// static u3j_harm _139_hex__lagoon_range_a[]={{".2", u3wf_la_range}, {}}; +// static u3j_harm _139_hex__lagoon_submatrix_a[]={{".2", u3wf_la_submatrix}, {}}; +static u3j_harm _139_hex__lagoon_abs_a[]={{".2", u3wf_la_abs}, {}}; +static u3j_harm _139_hex__lagoon_gth_a[]={{".2", u3wf_la_gth}, {}}; +static u3j_harm _139_hex__lagoon_gte_a[]={{".2", u3wf_la_gte}, {}}; +static u3j_harm _139_hex__lagoon_lth_a[]={{".2", u3wf_la_lth}, {}}; +static u3j_harm _139_hex__lagoon_lte_a[]={{".2", u3wf_la_lte}, {}}; static u3j_harm _139_hex__lagoon_diag_a[] = {{".2", u3wf_la_diag}, {}}; static u3j_harm _139_hex__lagoon_trace_a[]= {{".2", u3wf_la_trace}, {}}; static u3j_harm _139_hex__lagoon_mmul_a[] = {{".2", u3wf_la_mmul}, {}}; @@ -2179,6 +2194,21 @@ static u3j_core _139_hex__la_core_d[] = { "mod-scal", 7, _139_hex__lagoon_mods_a, 0, no_hashes }, { "dot", 7, _139_hex__lagoon_dot_a, 0, no_hashes }, { "transpose",7, _139_hex__lagoon_trans_a, 0, no_hashes }, + // { "stack", 7, _139_hex__lagoon_stack_a, 0, no_hashes }, + // { "cumsum", 7, _139_hex__lagoon_cumsum_a, 0, no_hashes }, + // { "argmin", 7, _139_hex__lagoon_argmin_a, 0, no_hashes }, + // { "argmax", 7, _139_hex__lagoon_argmax_a, 0, no_hashes }, + // { "ravel", 7, _139_hex__lagoon_ravel_a, 0, no_hashes }, + // { "min", 7, _139_hex__lagoon_min_a, 0, no_hashes }, + // { "max", 7, _139_hex__lagoon_max_a, 0, no_hashes }, + { "linspace", 7, _139_hex__lagoon_linspace_a, 0, no_hashes }, + // { "range", 7, _139_hex__lagoon_range_a, 0, no_hashes }, + // { "submatrix",7, _139_hex__lagoon_submatrix_a, 0, no_hashes }, + { "abs", 7, _139_hex__lagoon_abs_a, 0, no_hashes }, + { "gth", 7, _139_hex__lagoon_gth_a, 0, no_hashes 
}, + { "gte", 7, _139_hex__lagoon_gte_a, 0, no_hashes }, + { "lth", 7, _139_hex__lagoon_lth_a, 0, no_hashes }, + { "lte", 7, _139_hex__lagoon_lte_a, 0, no_hashes }, { "diag", 7, _139_hex__lagoon_diag_a, 0, no_hashes }, { "trace", 7, _139_hex__lagoon_trace_a,0, no_hashes }, { "mmul", 7, _139_hex__lagoon_mmul_a, 0, no_hashes }, diff --git a/pkg/noun/jets/w.h b/pkg/noun/jets/w.h index 36d66c21ba..6ab0518f67 100644 --- a/pkg/noun/jets/w.h +++ b/pkg/noun/jets/w.h @@ -344,6 +344,15 @@ u3_noun u3wf_la_dot(u3_noun); u3_noun u3wf_la_diag(u3_noun); u3_noun u3wf_la_transpose(u3_noun); + + u3_noun u3wf_la_linspace(u3_noun); + + u3_noun u3wf_la_abs(u3_noun); + u3_noun u3wf_la_gth(u3_noun); + u3_noun u3wf_la_gte(u3_noun); + u3_noun u3wf_la_lth(u3_noun); + u3_noun u3wf_la_lte(u3_noun); + u3_noun u3wf_la_trace(u3_noun); u3_noun u3wf_la_mmul(u3_noun); From 118b28103bd04b60e90a1460fe019b329fe8f589 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Tue, 16 Apr 2024 20:35:17 -0500 Subject: [PATCH 17/97] Add min/max jets. 
--- WORKSPACE.bazel | 2 +- pkg/noun/jets/f/lagoon.c | 230 +++++++++++++++++++++++++++++++++++++++ pkg/noun/jets/q.h | 2 + pkg/noun/jets/tree.c | 8 +- pkg/noun/jets/w.h | 2 + 5 files changed, 239 insertions(+), 5 deletions(-) diff --git a/WORKSPACE.bazel b/WORKSPACE.bazel index de3564b253..11cb3c7f58 100644 --- a/WORKSPACE.bazel +++ b/WORKSPACE.bazel @@ -297,7 +297,7 @@ versioned_http_archive( strip_prefix = "SoftBLAS-{version}", # sha256 = "", url = "https://github.com/urbit/SoftBLAS/archive/{version}.tar.gz", - version = "3af44d8cbf0d61e31946af9127099257160d0451", + version = "afeccbabaf43b7d0fde6f3d2809b9c811b91641e", ) versioned_http_archive( diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c index 7ffc7fd7ad..aec62641ba 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/f/lagoon.c @@ -456,6 +456,162 @@ return r_data; } +/* min - min(x,y) +*/ + u3_noun + u3qf_la_min_real(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/ leading 0x1, skipped by for range) + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); + + u3_noun r_data; + + // Switch on the block size. 
+ switch (bloq) { + case 4: ; + float16_t min_val16 = ((float16_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + min_val16 = f16_min(min_val16, ((float16_t*)x_bytes)[i]); + } + float16_t r16[2]; + r16[0] = min_val16; + r16[1].v = 0x1; + r_data = u3i_bytes((2+1)*sizeof(c3_y), (c3_y*)r16); + break; + + case 5: ; + float32_t min_val32 = ((float32_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + min_val32 = f32_min(min_val32, ((float32_t*)x_bytes)[i]); + } + float32_t r32[2]; + r32[0] = min_val32; + r32[1].v = 0x1; + r_data = u3i_bytes((4+1)*sizeof(c3_y), (c3_y*)r32); + break; + + case 6: ; + float64_t min_val64 = ((float64_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + min_val64 = f64_min(min_val64, ((float64_t*)x_bytes)[i]); + } + float64_t r64[2]; + r64[0] = min_val64; + r64[1].v = 0x1; + r_data = u3i_bytes((8+1)*sizeof(c3_y), (c3_y*)r64); + break; + + case 7: ; + float128_t min_val128 = ((float128_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + min_val128 = *f128M_min(&min_val128, &((float128_t*)x_bytes)[i]); + } + float128_t r128[2]; + r128[0] = min_val128; + r128[1] = (float128_t){0x1, 0x0}; + r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)r128); + break; + } + + // Clean up and return. + u3a_free(x_bytes); + + return r_data; + } + +/* max - max(x,y) +*/ + u3_noun + u3qf_la_max_real(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/ leading 0x1, skipped by for range) + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); + + u3_noun r_data; + + // Switch on the block size. 
+ switch (bloq) { + case 4: ; + float16_t max_val16 = ((float16_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + max_val16 = f16_max(max_val16, ((float16_t*)x_bytes)[i]); + } + float16_t r16[2]; + r16[0] = max_val16; + r16[1].v = 0x1; + r_data = u3i_bytes((2+1)*sizeof(c3_y), (c3_y*)r16); + break; + + case 5: ; + float32_t max_val32 = ((float32_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + max_val32 = f32_max(max_val32, ((float32_t*)x_bytes)[i]); + } + float32_t r32[2]; + r32[0] = max_val32; + r32[1].v = 0x1; + r_data = u3i_bytes((4+1)*sizeof(c3_y), (c3_y*)r32); + break; + + case 6: ; + float64_t max_val64 = ((float64_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + max_val64 = f64_max(max_val64, ((float64_t*)x_bytes)[i]); + } + float64_t r64[2]; + r64[0] = max_val64; + r64[1].v = 0x1; + r_data = u3i_bytes((8+1)*sizeof(c3_y), (c3_y*)r64); + break; + + case 7: ; + float128_t max_val128 = ((float128_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + max_val128 = *f128M_max(&max_val128, &((float128_t*)x_bytes)[i]); + } + float128_t r128[2]; + r128[0] = max_val128; + r128[1] = (float128_t){0x1, 0x0}; + r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)r128); + break; + } + + // Clean up and return. 
+ u3a_free(x_bytes); + + return r_data; + } + /* abs - |x| */ u3_noun @@ -1850,6 +2006,80 @@ } } + u3_noun + u3wf_la_min(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_fxp; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__real: ; + u3_noun r_data = u3qf_la_min_real(x_data, x_shape, x_bloq); + return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wf_la_max(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_fxp; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__real: ; + u3_noun r_data = u3qf_la_max_real(x_data, x_shape, x_bloq); + return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + + default: + return u3_none; + } + } + } + } + u3_noun u3wf_la_abs(u3_noun cor) { diff --git a/pkg/noun/jets/q.h b/pkg/noun/jets/q.h index de853bfe94..17fdb2a646 100644 --- a/pkg/noun/jets/q.h +++ b/pkg/noun/jets/q.h @@ -261,6 +261,8 @@ u3_noun u3qf_la_diag(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_transpose(u3_noun, u3_noun, u3_noun); + u3_noun 
u3qf_la_min_real(u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_max_real(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_linspace_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_abs_real(u3_noun, u3_noun, u3_noun); diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index 6252c2fa63..bc482ced0a 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2168,8 +2168,8 @@ static u3j_harm _139_hex__lagoon_trans_a[] ={{".2", u3wf_la_transpose}, {}}; // static u3j_harm _139_hex__lagoon_argmin_a[]={{".2", u3wf_la_argmin}, {}}; // static u3j_harm _139_hex__lagoon_argmax_a[]={{".2", u3wf_la_argmax}, {}}; // static u3j_harm _139_hex__lagoon_ravel_a[]={{".2", u3wf_la_ravel}, {}}; -// static u3j_harm _139_hex__lagoon_min_a[]={{".2", u3wf_la_min}, {}}; -// static u3j_harm _139_hex__lagoon_max_a[]={{".2", u3wf_la_max}, {}}; +static u3j_harm _139_hex__lagoon_min_a[]={{".2", u3wf_la_min}, {}}; +static u3j_harm _139_hex__lagoon_max_a[]={{".2", u3wf_la_max}, {}}; static u3j_harm _139_hex__lagoon_linspace_a[]={{".2", u3wf_la_linspace}, {}}; // static u3j_harm _139_hex__lagoon_range_a[]={{".2", u3wf_la_range}, {}}; // static u3j_harm _139_hex__lagoon_submatrix_a[]={{".2", u3wf_la_submatrix}, {}}; @@ -2199,8 +2199,8 @@ static u3j_core _139_hex__la_core_d[] = // { "argmin", 7, _139_hex__lagoon_argmin_a, 0, no_hashes }, // { "argmax", 7, _139_hex__lagoon_argmax_a, 0, no_hashes }, // { "ravel", 7, _139_hex__lagoon_ravel_a, 0, no_hashes }, - // { "min", 7, _139_hex__lagoon_min_a, 0, no_hashes }, - // { "max", 7, _139_hex__lagoon_max_a, 0, no_hashes }, + { "min", 7, _139_hex__lagoon_min_a, 0, no_hashes }, + { "max", 7, _139_hex__lagoon_max_a, 0, no_hashes }, { "linspace", 7, _139_hex__lagoon_linspace_a, 0, no_hashes }, // { "range", 7, _139_hex__lagoon_range_a, 0, no_hashes }, // { "submatrix",7, _139_hex__lagoon_submatrix_a, 0, no_hashes }, diff --git a/pkg/noun/jets/w.h b/pkg/noun/jets/w.h index 6ab0518f67..1e250bb79c 100644 --- a/pkg/noun/jets/w.h +++ b/pkg/noun/jets/w.h 
@@ -345,6 +345,8 @@ u3_noun u3wf_la_diag(u3_noun); u3_noun u3wf_la_transpose(u3_noun); + u3_noun u3wf_la_min(u3_noun); + u3_noun u3wf_la_max(u3_noun); u3_noun u3wf_la_linspace(u3_noun); u3_noun u3wf_la_abs(u3_noun); From c617a8297be21c9a0cf2f70041527cf3268a5039 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Wed, 17 Apr 2024 09:58:33 -0500 Subject: [PATCH 18/97] Add min/max arg jets. --- WORKSPACE.bazel | 2 +- pkg/noun/jets/f/lagoon.c | 222 +++++++++++++++++++++++++++++++++++++++ pkg/noun/jets/q.h | 3 + pkg/noun/jets/tree.c | 8 +- pkg/noun/jets/w.h | 3 + 5 files changed, 233 insertions(+), 5 deletions(-) diff --git a/WORKSPACE.bazel b/WORKSPACE.bazel index 11cb3c7f58..c904a63eff 100644 --- a/WORKSPACE.bazel +++ b/WORKSPACE.bazel @@ -297,7 +297,7 @@ versioned_http_archive( strip_prefix = "SoftBLAS-{version}", # sha256 = "", url = "https://github.com/urbit/SoftBLAS/archive/{version}.tar.gz", - version = "afeccbabaf43b7d0fde6f3d2809b9c811b91641e", + version = "7d05697aea5363dcf5f877a9c8b464e9c352d3d4", ) versioned_http_archive( diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c index aec62641ba..4f724d4514 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/f/lagoon.c @@ -456,6 +456,152 @@ return r_data; } +/* argmin - argmin(x) +*/ + u3_noun + u3qf_la_argmin_real(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1, which doesn't matter here) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + c3_w min_idx = 0; + + // Switch on the block size. 
+ switch (bloq) { + case 4: ; + float16_t min_val16 = ((float16_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f16_lt(((float16_t*)x_bytes)[i], min_val16)) { + min_val16 = ((float16_t*)x_bytes)[i]; + min_idx = (len_x - i); + } + } + break; + + case 5: ; + float32_t min_val32 = ((float32_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f32_lt(((float32_t*)x_bytes)[i], min_val32)) { + min_val32 = ((float32_t*)x_bytes)[i]; + min_idx = (len_x - i); + } + } + break; + + case 6: ; + float64_t min_val64 = ((float64_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f64_lt(((float64_t*)x_bytes)[i], min_val64)) { + min_val64 = ((float64_t*)x_bytes)[i]; + min_idx = (len_x - i); + } + } + break; + + case 7: ; + float128_t min_val128 = ((float128_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f128_lt(((float128_t*)x_bytes)[i], min_val128)) { + min_val128 = *f128M_min(&min_val128, &((float128_t*)x_bytes)[i]); + min_idx = (len_x - i); + } + } + break; + } + + u3_noun r_data = u3i_chub(min_idx); + + return r_data; + } + +/* argmax - argmax(x) +*/ + u3_noun + u3qf_la_argmax_real(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1, which doesn't matter here) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + c3_w max_idx = 0; + + // Switch on the block size. 
+ switch (bloq) { + case 4: ; + float16_t max_val16 = ((float16_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f16_gt(((float16_t*)x_bytes)[i], max_val16)) { + max_val16 = ((float16_t*)x_bytes)[i]; + max_idx = (len_x - i); + } + } + break; + + case 5: ; + float32_t max_val32 = ((float32_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f32_gt(((float32_t*)x_bytes)[i], max_val32)) { + max_val32 = ((float32_t*)x_bytes)[i]; + max_idx = (len_x - i); + } + } + break; + + case 6: ; + float64_t max_val64 = ((float64_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f64_gt(((float64_t*)x_bytes)[i], max_val64)) { + max_val64 = ((float64_t*)x_bytes)[i]; + max_idx = (len_x - i); + } + } + break; + + case 7: ; + float128_t max_val128 = ((float128_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f128_gt(((float128_t*)x_bytes)[i], max_val128)) { + max_val128 = *f128M_max(&max_val128, &((float128_t*)x_bytes)[i]); + max_idx = (len_x - i); + } + } + break; + } + + u3_noun r_data = u3i_chub(max_idx); + + return r_data; + } + /* min - min(x,y) */ u3_noun @@ -2006,6 +2152,82 @@ } } + u3_noun + u3wf_la_argmin(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_fxp; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__real: ; + u3_noun r_data = u3qf_la_argmin_real(x_data, x_shape, x_bloq); + // bare atom (@ index) + return r_data; + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wf_la_argmax(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == 
u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_fxp; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__real: ; + u3_noun r_data = u3qf_la_argmax_real(x_data, x_shape, x_bloq); + // bare atom (@ index) + return r_data; + + default: + return u3_none; + } + } + } + } + u3_noun u3wf_la_min(u3_noun cor) { diff --git a/pkg/noun/jets/q.h b/pkg/noun/jets/q.h index 17fdb2a646..cf25093573 100644 --- a/pkg/noun/jets/q.h +++ b/pkg/noun/jets/q.h @@ -261,6 +261,9 @@ u3_noun u3qf_la_diag(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_transpose(u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_argmin_real(u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_argmax_real(u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_min_real(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_max_real(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_linspace_real(u3_noun, u3_noun, u3_noun, u3_noun); diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index bc482ced0a..1b876597fc 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2165,8 +2165,8 @@ static u3j_harm _139_hex__lagoon_dot_a[] = {{".2", u3wf_la_dot}, {}}; static u3j_harm _139_hex__lagoon_trans_a[] ={{".2", u3wf_la_transpose}, {}}; // static u3j_harm _139_hex__lagoon_stack_a[] ={{".2", u3wf_la_stack}, {}}; // static u3j_harm _139_hex__lagoon_cumsum_a[]={{".2", u3wf_la_cumsum}, {}}; -// static u3j_harm _139_hex__lagoon_argmin_a[]={{".2", u3wf_la_argmin}, {}}; -// static u3j_harm _139_hex__lagoon_argmax_a[]={{".2", u3wf_la_argmax}, {}}; +static u3j_harm _139_hex__lagoon_argmin_a[]={{".2", u3wf_la_argmin}, {}}; +static u3j_harm _139_hex__lagoon_argmax_a[]={{".2", u3wf_la_argmax}, {}}; // static u3j_harm 
_139_hex__lagoon_ravel_a[]={{".2", u3wf_la_ravel}, {}}; static u3j_harm _139_hex__lagoon_min_a[]={{".2", u3wf_la_min}, {}}; static u3j_harm _139_hex__lagoon_max_a[]={{".2", u3wf_la_max}, {}}; @@ -2196,8 +2196,8 @@ static u3j_core _139_hex__la_core_d[] = { "transpose",7, _139_hex__lagoon_trans_a, 0, no_hashes }, // { "stack", 7, _139_hex__lagoon_stack_a, 0, no_hashes }, // { "cumsum", 7, _139_hex__lagoon_cumsum_a, 0, no_hashes }, - // { "argmin", 7, _139_hex__lagoon_argmin_a, 0, no_hashes }, - // { "argmax", 7, _139_hex__lagoon_argmax_a, 0, no_hashes }, + { "argmin", 7, _139_hex__lagoon_argmin_a, 0, no_hashes }, + { "argmax", 7, _139_hex__lagoon_argmax_a, 0, no_hashes }, // { "ravel", 7, _139_hex__lagoon_ravel_a, 0, no_hashes }, { "min", 7, _139_hex__lagoon_min_a, 0, no_hashes }, { "max", 7, _139_hex__lagoon_max_a, 0, no_hashes }, diff --git a/pkg/noun/jets/w.h b/pkg/noun/jets/w.h index 1e250bb79c..d8b887c896 100644 --- a/pkg/noun/jets/w.h +++ b/pkg/noun/jets/w.h @@ -345,6 +345,9 @@ u3_noun u3wf_la_diag(u3_noun); u3_noun u3wf_la_transpose(u3_noun); + u3_noun u3wf_la_argmin(u3_noun); + u3_noun u3wf_la_argmax(u3_noun); + u3_noun u3wf_la_min(u3_noun); u3_noun u3wf_la_max(u3_noun); u3_noun u3wf_la_linspace(u3_noun); From 44f1013fc460c3fbe426297a1a98f92e5c892ef8 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Wed, 17 Apr 2024 10:17:38 -0500 Subject: [PATCH 19/97] Add cumsum jet. --- pkg/noun/jets/f/lagoon.c | 126 ++++++++++++++++++++++++++++++++++++--- pkg/noun/jets/q.h | 1 + pkg/noun/jets/tree.c | 2 +- pkg/noun/jets/w.h | 1 + 4 files changed, 121 insertions(+), 9 deletions(-) diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c index 4f724d4514..4235daec55 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/f/lagoon.c @@ -456,6 +456,76 @@ return r_data; } +/* cumsum - x[0] + x[1] + ... x[n] +*/ + u3_noun + u3qf_la_cumsum_real(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. 
+ if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // y_bytes is the data array (w/ leading 0x1, skipped by for range) + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); + + u3_noun r_data; + + // Switch on the block size. + switch (bloq) { + case 4: ; + float16_t sum16[2]; + for (c3_d i = 0; i < len_x; i++) { + sum16[0] = f16_add(sum16[0], ((float16_t*)x_bytes)[i]); + } + sum16[1].v = 0x1; + r_data = u3i_bytes((2+1)*sizeof(c3_y), (c3_y*)sum16); + break; + + case 5: ; + float32_t sum32[2]; + for (c3_d i = 0; i < len_x; i++) { + sum32[0] = f32_add(sum32[0], ((float32_t*)x_bytes)[i]); + } + sum32[1].v = 0x1; + r_data = u3i_bytes((4+1)*sizeof(c3_y), (c3_y*)sum32); + break; + + case 6: ; + float64_t sum64[2]; + for (c3_d i = 0; i < len_x; i++) { + sum64[0] = f64_add(sum64[0], ((float64_t*)x_bytes)[i]); + } + sum64[1].v = 0x1; + r_data = u3i_bytes((8+1)*sizeof(c3_y), (c3_y*)sum64); + break; + + case 7: ; + float128_t sum128[2]; + for (c3_d i = 0; i < len_x; i++) { + f128M_add(&(sum128[0]), &(((float128_t*)x_bytes)[i]), &(sum128[0])); + } + sum128[1] = (float128_t){0x1, 0x0}; + r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)sum128); + break; + } + + // Clean up and return. 
+ u3a_free(x_bytes); + + return r_data; + } + /* argmin - argmin(x) */ u3_noun @@ -488,7 +558,7 @@ for (c3_d i = 0; i < len_x; i++) { if(f16_lt(((float16_t*)x_bytes)[i], min_val16)) { min_val16 = ((float16_t*)x_bytes)[i]; - min_idx = (len_x - i); + min_idx = (len_x - i - 1); } } break; @@ -498,7 +568,7 @@ for (c3_d i = 0; i < len_x; i++) { if(f32_lt(((float32_t*)x_bytes)[i], min_val32)) { min_val32 = ((float32_t*)x_bytes)[i]; - min_idx = (len_x - i); + min_idx = (len_x - i - 1); } } break; @@ -508,7 +578,7 @@ for (c3_d i = 0; i < len_x; i++) { if(f64_lt(((float64_t*)x_bytes)[i], min_val64)) { min_val64 = ((float64_t*)x_bytes)[i]; - min_idx = (len_x - i); + min_idx = (len_x - i - 1); } } break; @@ -518,7 +588,7 @@ for (c3_d i = 0; i < len_x; i++) { if(f128_lt(((float128_t*)x_bytes)[i], min_val128)) { min_val128 = *f128M_min(&min_val128, &((float128_t*)x_bytes)[i]); - min_idx = (len_x - i); + min_idx = (len_x - i - 1); } } break; @@ -561,7 +631,7 @@ for (c3_d i = 0; i < len_x; i++) { if(f16_gt(((float16_t*)x_bytes)[i], max_val16)) { max_val16 = ((float16_t*)x_bytes)[i]; - max_idx = (len_x - i); + max_idx = (len_x - i - 1); } } break; @@ -571,7 +641,7 @@ for (c3_d i = 0; i < len_x; i++) { if(f32_gt(((float32_t*)x_bytes)[i], max_val32)) { max_val32 = ((float32_t*)x_bytes)[i]; - max_idx = (len_x - i); + max_idx = (len_x - i - 1); } } break; @@ -581,7 +651,7 @@ for (c3_d i = 0; i < len_x; i++) { if(f64_gt(((float64_t*)x_bytes)[i], max_val64)) { max_val64 = ((float64_t*)x_bytes)[i]; - max_idx = (len_x - i); + max_idx = (len_x - i - 1); } } break; @@ -591,7 +661,7 @@ for (c3_d i = 0; i < len_x; i++) { if(f128_gt(((float128_t*)x_bytes)[i], max_val128)) { max_val128 = *f128M_max(&max_val128, &((float128_t*)x_bytes)[i]); - max_idx = (len_x - i); + max_idx = (len_x - i - 1); } } break; @@ -2152,6 +2222,46 @@ } } + u3_noun + u3wf_la_cumsum(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, 
+ u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_fxp, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_cumsum_real(x_data, x_shape, x_bloq); + return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + + default: + return u3_none; + } + } + } + } + u3_noun u3wf_la_argmin(u3_noun cor) { diff --git a/pkg/noun/jets/q.h b/pkg/noun/jets/q.h index cf25093573..34a6b7dca7 100644 --- a/pkg/noun/jets/q.h +++ b/pkg/noun/jets/q.h @@ -261,6 +261,7 @@ u3_noun u3qf_la_diag(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_transpose(u3_noun, u3_noun, u3_noun); + u3_noun u3qf_la_cumsum_real(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_argmin_real(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_argmax_real(u3_noun, u3_noun, u3_noun); diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index 1b876597fc..1738f75339 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2164,7 +2164,7 @@ static u3j_harm _139_hex__lagoon_mods_a[] = {{".2", u3wf_la_mods}, {}}; static u3j_harm _139_hex__lagoon_dot_a[] = {{".2", u3wf_la_dot}, {}}; static u3j_harm _139_hex__lagoon_trans_a[] ={{".2", u3wf_la_transpose}, {}}; // static u3j_harm _139_hex__lagoon_stack_a[] ={{".2", u3wf_la_stack}, {}}; -// static u3j_harm _139_hex__lagoon_cumsum_a[]={{".2", u3wf_la_cumsum}, {}}; +static u3j_harm _139_hex__lagoon_cumsum_a[]={{".2", u3wf_la_cumsum}, {}}; static u3j_harm _139_hex__lagoon_argmin_a[]={{".2", u3wf_la_argmin}, {}}; static u3j_harm _139_hex__lagoon_argmax_a[]={{".2", u3wf_la_argmax}, {}}; // static u3j_harm _139_hex__lagoon_ravel_a[]={{".2", u3wf_la_ravel}, {}}; 
diff --git a/pkg/noun/jets/w.h b/pkg/noun/jets/w.h index d8b887c896..2b4626f324 100644 --- a/pkg/noun/jets/w.h +++ b/pkg/noun/jets/w.h @@ -345,6 +345,7 @@ u3_noun u3wf_la_diag(u3_noun); u3_noun u3wf_la_transpose(u3_noun); + u3_noun u3wf_la_cumsum(u3_noun); u3_noun u3wf_la_argmin(u3_noun); u3_noun u3wf_la_argmax(u3_noun); From c1f651d85ff5e3e24480cf210e87d1c2e6d34235 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Wed, 17 Apr 2024 10:41:24 -0500 Subject: [PATCH 20/97] Add ravel jet. --- pkg/noun/jets/f/lagoon.c | 102 +++++++++++++++++++++++++++++++++++++++ pkg/noun/jets/q.h | 2 +- pkg/noun/jets/tree.c | 2 +- pkg/noun/jets/w.h | 2 +- 4 files changed, 105 insertions(+), 3 deletions(-) diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c index 4235daec55..09962d0a8f 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/f/lagoon.c @@ -672,6 +672,71 @@ return r_data; } +/* ravel - x -> ~[x[0], x[1], ... x[n]] + entire nd-array busted out as a linear list +*/ + u3_noun + u3qf_la_ravel_real(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // r_data is the result noun of [data] + u3_noun r_data; + + // Switch on the block size. 
+ switch (bloq) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + r_data = u3nc(u3i_word(x_val16.v), r_data); + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + r_data = u3nc(u3i_word(x_val32.v), r_data); + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + r_data = u3nc(u3i_chub(x_val64.v), r_data); + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + float128_t x_val128 = ((float128_t*)x_bytes)[i]; + r_data = u3nc(u3i_chubs(2, (c3_d*)&(x_val128.v)), r_data); + } + break; + } + + // Clean up and return. + u3a_free(x_bytes); + + // return u3qb_flop(r_data); + return r_data; + } + /* min - min(x,y) */ u3_noun @@ -2300,6 +2365,43 @@ } } + u3_noun + u3wf_la_ravel(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_fxp; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__real: ; + u3_noun r_data = u3qf_la_ravel_real(x_data, x_shape, x_bloq); + // (list @) + return r_data; + + default: + return u3_none; + } + } + } + } + u3_noun u3wf_la_argmax(u3_noun cor) { diff --git a/pkg/noun/jets/q.h b/pkg/noun/jets/q.h index 34a6b7dca7..0f38bb3a96 100644 --- a/pkg/noun/jets/q.h +++ b/pkg/noun/jets/q.h @@ -264,7 +264,7 @@ u3_noun u3qf_la_cumsum_real(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_argmin_real(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_argmax_real(u3_noun, u3_noun, u3_noun); - + u3_noun u3qf_la_ravel_real(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_min_real(u3_noun, u3_noun, u3_noun); u3_noun 
u3qf_la_max_real(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_linspace_real(u3_noun, u3_noun, u3_noun, u3_noun); diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index 1738f75339..9dea54bcfc 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2167,7 +2167,7 @@ static u3j_harm _139_hex__lagoon_trans_a[] ={{".2", u3wf_la_transpose}, {}}; static u3j_harm _139_hex__lagoon_cumsum_a[]={{".2", u3wf_la_cumsum}, {}}; static u3j_harm _139_hex__lagoon_argmin_a[]={{".2", u3wf_la_argmin}, {}}; static u3j_harm _139_hex__lagoon_argmax_a[]={{".2", u3wf_la_argmax}, {}}; -// static u3j_harm _139_hex__lagoon_ravel_a[]={{".2", u3wf_la_ravel}, {}}; +static u3j_harm _139_hex__lagoon_ravel_a[]={{".2", u3wf_la_ravel}, {}}; static u3j_harm _139_hex__lagoon_min_a[]={{".2", u3wf_la_min}, {}}; static u3j_harm _139_hex__lagoon_max_a[]={{".2", u3wf_la_max}, {}}; static u3j_harm _139_hex__lagoon_linspace_a[]={{".2", u3wf_la_linspace}, {}}; diff --git a/pkg/noun/jets/w.h b/pkg/noun/jets/w.h index 2b4626f324..666b611bc5 100644 --- a/pkg/noun/jets/w.h +++ b/pkg/noun/jets/w.h @@ -348,7 +348,7 @@ u3_noun u3wf_la_cumsum(u3_noun); u3_noun u3wf_la_argmin(u3_noun); u3_noun u3wf_la_argmax(u3_noun); - + u3_noun u3wf_la_ravel(u3_noun); u3_noun u3wf_la_min(u3_noun); u3_noun u3wf_la_max(u3_noun); u3_noun u3wf_la_linspace(u3_noun); From 692b58d19b2ae8ba4e74489d2a8df8e562c26717 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Wed, 17 Apr 2024 13:41:40 -0500 Subject: [PATCH 21/97] Post "final" jet roster. 
--- pkg/noun/jets/f/lagoon.c | 210 +++++++++++++++++++++++++++++++++++---- pkg/noun/jets/q.h | 4 +- pkg/noun/jets/tree.c | 12 +-- pkg/noun/jets/w.h | 3 +- 4 files changed, 197 insertions(+), 32 deletions(-) diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c index 09962d0a8f..8dc6144d2e 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/f/lagoon.c @@ -7,7 +7,6 @@ #include "softfloat.h" #include "softblas.h" -#include #include // for pow() #include @@ -733,7 +732,6 @@ // Clean up and return. u3a_free(x_bytes); - // return u3qb_flop(r_data); return r_data; } @@ -1739,10 +1737,14 @@ return u3_none; } // Assert length of dims is 2. - assert(u3qb_lent(shape) == 2); + if (u3qb_lent(shape) != 2) { + return u3m_bail(c3__exit); + } // Unpack shape into an array of dimensions. c3_d *dims = _get_dims(shape); - assert(dims[0] == dims[1]); + if (dims[0] != dims[1]) { + return u3m_bail(c3__exit); + } // Unpack the data as a byte array. We assume total length < 2**64. c3_d len_x = _get_length(shape); @@ -1782,7 +1784,9 @@ u3_noun bloq) { // Assert length of dims is 2. - assert(u3qb_lent(shape) == 2); + if (u3qb_lent(shape) != 2) { + return u3m_bail(c3__exit); + } // Unpack shape into an array of dimensions. 
c3_d *dims = _get_dims(shape); @@ -1838,12 +1842,13 @@ u3r_bytes(0, 2, (c3_y*)&(a16.v), a); u3r_bytes(0, 2, (c3_y*)&(b16.v), b); float16_t span16 = f16_sub(b16, a16); - float16_t interval16 = f16_div(span16, i32_to_f16(n)); + float16_t interval16 = f16_div(span16, i32_to_f16(n-1)); c3_y* x_bytes16 = (c3_y*)u3a_malloc(((n+1)*2+1)*sizeof(c3_y)); for (c3_d i = 1; i <= n; i++) { ((float16_t*)x_bytes16)[n-i] = f16_add(a16, f16_mul(i32_to_f16(i), interval16)); } - ((float16_t*)x_bytes16)[n] = b16; + ((float16_t*)x_bytes16)[n] = a16; + ((float16_t*)x_bytes16)[0] = b16; x_bytes16[(n+1)*2] = 1; // pin head r_data = u3i_bytes(((n+1)*2+1)*sizeof(c3_y), x_bytes16); u3a_free(x_bytes16); @@ -1854,12 +1859,13 @@ u3r_bytes(0, 4, (c3_y*)&(a32.v), a); u3r_bytes(0, 4, (c3_y*)&(b32.v), b); float32_t span32 = f32_sub(b32, a32); - float32_t interval32 = f32_div(span32, i32_to_f32(n)); + float32_t interval32 = f32_div(span32, i32_to_f32(n-1)); c3_y* x_bytes32 = (c3_y*)u3a_malloc(((n+1)*4+1)*sizeof(c3_y)); for (c3_d i = 1; i <= n; i++) { ((float32_t*)x_bytes32)[n-i] = f32_add(a32, f32_mul(i32_to_f32(i), interval32)); } - ((float32_t*)x_bytes32)[n] = b32; + ((float32_t*)x_bytes32)[n] = a32; + ((float32_t*)x_bytes32)[0] = b32; x_bytes32[(n+1)*4] = 1; // pin head r_data = u3i_bytes(((n+1)*4+1)*sizeof(c3_y), x_bytes32); u3a_free(x_bytes32); @@ -1870,12 +1876,13 @@ u3r_bytes(0, 8, (c3_y*)&(a64.v), a); u3r_bytes(0, 8, (c3_y*)&(b64.v), b); float64_t span64 = f64_sub(b64, a64); - float64_t interval64 = f64_div(span64, i32_to_f64(n)); + float64_t interval64 = f64_div(span64, i32_to_f64(n-1)); c3_y* x_bytes64 = (c3_y*)u3a_malloc(((n+1)*8+1)*sizeof(c3_y)); for (c3_d i = 1; i < n; i++) { ((float64_t*)x_bytes64)[n-i] = f64_add(a64, f64_mul(i32_to_f64(i), interval64)); } - ((float64_t*)x_bytes64)[n] = b64; + ((float64_t*)x_bytes64)[n] = a64; + ((float64_t*)x_bytes64)[0] = b64; x_bytes64[(n+1)*8] = 1; // pin head r_data = u3i_bytes(((n+1)*8+1)*sizeof(c3_y), x_bytes64); u3a_free(x_bytes64); @@ -1889,7 
+1896,7 @@ f128M_sub(&b128, &a128, &span128); float128_t interval128; float128_t n128; - i32_to_f128M(n, &n128); + i32_to_f128M(n-1, &n128); f128M_div(&span128, &n128, &interval128); c3_y* x_bytes128 = (c3_y*)u3a_malloc(((n+1)*16+1)*sizeof(c3_y)); float128_t i128; @@ -1898,7 +1905,8 @@ f128M_mul(&i128, &interval128, &((float128_t*)x_bytes128)[n-i]); f128M_add(&a128, &((float128_t*)x_bytes128)[n-i], &((float128_t*)x_bytes128)[n-i]); } - ((float128_t*)x_bytes128)[n] = b128; + ((float128_t*)x_bytes128)[n] = a128; + ((float128_t*)x_bytes128)[0] = b128; x_bytes128[(n+1)*16] = 1; // pin head r_data = u3i_bytes(((n+1)*16+1)*sizeof(c3_y), x_bytes128); u3a_free(x_bytes128); @@ -1908,6 +1916,97 @@ return r_data; } +/* range - [a a+d ... b] +*/ + u3_noun + u3qf_la_range_real(u3_noun a, + u3_noun b, + u3_noun d, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + u3_noun r_data; + + switch (bloq) { + case 4: ; + float16_t a16, b16, interval16; + u3r_bytes(0, 2, (c3_y*)&(a16.v), a); + u3r_bytes(0, 2, (c3_y*)&(b16.v), b); + u3r_bytes(0, 2, (c3_y*)&(interval16.v), d); + c3_d n16 = f16_to_i64(f16_div(f16_sub(b16, a16), interval16), softfloat_round_minMag, false); + c3_y* x_bytes16 = (c3_y*)u3a_malloc(((n16+1)*2+1)*sizeof(c3_y)); + for (c3_d i = 1; i <= n16; i++) { + ((float16_t*)x_bytes16)[n16-i] = f16_add(a16, f16_mul(i32_to_f16(i), interval16)); + } + ((float16_t*)x_bytes16)[n16] = a16; + // ((float16_t*)x_bytes16)[0] = b16; + x_bytes16[(n16+1)*2] = 1; // pin head + r_data = u3i_bytes(((n16+1)*2+1)*sizeof(c3_y), x_bytes16); + u3a_free(x_bytes16); + break; + + case 5: ; + float32_t a32, b32, interval32; + u3r_bytes(0, 4, (c3_y*)&(a32.v), a); + u3r_bytes(0, 4, (c3_y*)&(b32.v), b); + u3r_bytes(0, 4, (c3_y*)&(interval32.v), d); + c3_d n32 = f32_to_i64(f32_div(f32_sub(b32, a32), interval32), softfloat_round_minMag, false); + c3_y* x_bytes32 = (c3_y*)u3a_malloc(((n32+1)*4+1)*sizeof(c3_y)); + for (c3_d i = 1; i <= n32; i++) { + 
((float32_t*)x_bytes32)[n32-i] = f32_add(a32, f32_mul(i32_to_f32(i), interval32)); + } + ((float32_t*)x_bytes32)[n32] = a32; + // ((float32_t*)x_bytes32)[0] = b32; + x_bytes32[(n32+1)*4] = 1; // pin head + r_data = u3i_bytes(((n32+1)*4+1)*sizeof(c3_y), x_bytes32); + u3a_free(x_bytes32); + break; + + case 6: ; + float64_t a64, b64, interval64; + u3r_bytes(0, 8, (c3_y*)&(a64.v), a); + u3r_bytes(0, 8, (c3_y*)&(b64.v), b); + u3r_bytes(0, 8, (c3_y*)&(interval64.v), d); + c3_d n64 = f64_to_i64(f64_div(f64_sub(b64, a64), interval64), softfloat_round_minMag, false); + c3_y* x_bytes64 = (c3_y*)u3a_malloc(((n64+1)*8+1)*sizeof(c3_y)); + for (c3_d i = 1; i < n64; i++) { + ((float64_t*)x_bytes64)[n64-i] = f64_add(a64, f64_mul(i32_to_f64(i), interval64)); + } + ((float64_t*)x_bytes64)[n64] = a64; + // ((float64_t*)x_bytes64)[0] = b64; + x_bytes64[(n64+1)*8] = 1; // pin head + r_data = u3i_bytes(((n64+1)*8+1)*sizeof(c3_y), x_bytes64); + u3a_free(x_bytes64); + break; + + case 7: ; + float128_t a128, b128, interval128; + u3r_bytes(0, 16, (c3_y*)&(a128.v[0]), a); + u3r_bytes(0, 16, (c3_y*)&(b128.v[0]), b); + u3r_bytes(0, 16, (c3_y*)&(interval128.v), d); + c3_d n128 = f128_to_i64(f128_div(f128_sub(b128, a128), interval128), softfloat_round_minMag, false); + c3_y* x_bytes128 = (c3_y*)u3a_malloc(((n128+1)*16+1)*sizeof(c3_y)); + float128_t i128; + for (c3_d i = 1; i < n128; i++) { + i32_to_f128M(i, &i128); + f128M_mul(&i128, &interval128, &((float128_t*)x_bytes128)[n128-i]); + f128M_add(&a128, &((float128_t*)x_bytes128)[n128-i], &((float128_t*)x_bytes128)[n128-i]); + } + ((float128_t*)x_bytes128)[n128] = a128; + // ((float128_t*)x_bytes128)[0] = b128; + x_bytes128[(n128+1)*16] = 1; // pin head + r_data = u3i_bytes(((n128+1)*16+1)*sizeof(c3_y), x_bytes128); + u3a_free(x_bytes128); + break; + } + + return r_data; + } + /* trace - tr(x) */ u3_noun @@ -1936,10 +2035,12 @@ c3_d Nb = u3h(y_shape); c3_d P = u3h(u3t(y_shape)); - assert(u3_nul == u3t(u3t(x_shape))); - assert(Na == Nb); + if 
((u3_nul != u3t(u3t(x_shape))) || + (u3_nul != u3t(u3t(y_shape))) || + (Na != Nb)) { + return u3m_bail(c3__exit); + } c3_d N = Na; - assert(u3_nul == u3t(u3t(y_shape))); c3_y* x_bytes = (c3_y*)u3a_malloc((M*N)*sizeof(c3_y)); u3r_bytes(0, M*N, x_bytes, x_data); @@ -3029,7 +3130,7 @@ u3_noun u3wf_la_linspace(u3_noun cor) { - u3_noun x_meta, a, b, n; + u3_noun x_meta, a, b, n, rnd; if ( c3n == u3r_mean(cor, u3x_sam_2, &x_meta, @@ -3045,6 +3146,7 @@ x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 x_fxp = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 if ( c3n == u3ud(x_bloq) || c3n == u3ud(x_kind) ) @@ -3052,9 +3154,79 @@ return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: ; + case c3__real: + _set_rounding(rnd); u3_noun r_data = u3qf_la_linspace_real(a, b, n, x_bloq); - x_shape = u3nc(u3x_atom(n)+1, u3_nul); + x_shape = u3nt(u3x_atom(n), 0x1, u3_nul); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wf_la_range(u3_noun cor) + { + u3_noun x_meta, a, b, d, rnd; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_12, &a, + u3x_sam_13, &b, + u3x_sam_7, &d, + 0)) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_fxp; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__real: + _set_rounding(rnd); + u3_noun r_data = u3qf_la_range_real(a, b, d, x_bloq); + c3_d a_, b_, d_; + c3_ds n_; + switch (x_bloq) { + case 4: + u3r_bytes(0, 2, (c3_y*)&a_, a); + u3r_bytes(0, 2, (c3_y*)&b_, b); + u3r_bytes(0, 2, (c3_y*)&d_, d); + n_ = f16_to_i64(f16_div(f16_sub((float16_t){b_}, (float16_t){a_}), (float16_t){d_}), softfloat_round_minMag, 
false); + break; + case 5: + u3r_bytes(0, 4, (c3_y*)&a_, a); + u3r_bytes(0, 4, (c3_y*)&b_, b); + u3r_bytes(0, 4, (c3_y*)&d_, d); + n_ = f32_to_i64(f32_div(f32_sub((float32_t){b_}, (float32_t){a_}), (float32_t){d_}), softfloat_round_minMag, false); + break; + case 6: + u3r_bytes(0, 8, (c3_y*)&a_, a); + u3r_bytes(0, 8, (c3_y*)&b_, b); + u3r_bytes(0, 8, (c3_y*)&d_, d); + n_ = f64_to_i64(f64_div(f64_sub((float64_t){b_}, (float64_t){a_}), (float64_t){d_}), softfloat_round_minMag, false); + break; + case 7: + u3r_bytes(0, 16, (c3_y*)&a_, a); + u3r_bytes(0, 16, (c3_y*)&b_, b); + u3r_bytes(0, 16, (c3_y*)&d_, d); + n_ = f128_to_i64(f128_div(f128_sub((float128_t){b_}, (float128_t){a_}), (float128_t){d_}), softfloat_round_minMag, false); + break; + } + u3_noun n = u3i_chub(n_+1); + x_shape = u3nt(u3k(n), 0x1, u3_nul); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: diff --git a/pkg/noun/jets/q.h b/pkg/noun/jets/q.h index 0f38bb3a96..b5e5cbc8e1 100644 --- a/pkg/noun/jets/q.h +++ b/pkg/noun/jets/q.h @@ -260,7 +260,6 @@ u3_noun u3qf_la_dot_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_diag(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_transpose(u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_cumsum_real(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_argmin_real(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_argmax_real(u3_noun, u3_noun, u3_noun); @@ -268,13 +267,12 @@ u3_noun u3qf_la_min_real(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_max_real(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_linspace_real(u3_noun, u3_noun, u3_noun, u3_noun); - + u3_noun u3qf_la_range_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_abs_real(u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_gth_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_gte_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_lth_real(u3_noun, u3_noun, u3_noun, u3_noun); u3_noun u3qf_la_lte_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_trace_real(u3_noun, 
u3_noun, u3_noun); u3_noun u3qf_la_mmul_real(u3_noun, u3_noun, u3_noun, u3_noun, u3_noun); diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index 9dea54bcfc..dc27123a4d 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2163,7 +2163,6 @@ static u3j_harm _139_hex__lagoon_divs_a[] = {{".2", u3wf_la_divs}, {}}; static u3j_harm _139_hex__lagoon_mods_a[] = {{".2", u3wf_la_mods}, {}}; static u3j_harm _139_hex__lagoon_dot_a[] = {{".2", u3wf_la_dot}, {}}; static u3j_harm _139_hex__lagoon_trans_a[] ={{".2", u3wf_la_transpose}, {}}; -// static u3j_harm _139_hex__lagoon_stack_a[] ={{".2", u3wf_la_stack}, {}}; static u3j_harm _139_hex__lagoon_cumsum_a[]={{".2", u3wf_la_cumsum}, {}}; static u3j_harm _139_hex__lagoon_argmin_a[]={{".2", u3wf_la_argmin}, {}}; static u3j_harm _139_hex__lagoon_argmax_a[]={{".2", u3wf_la_argmax}, {}}; @@ -2171,8 +2170,7 @@ static u3j_harm _139_hex__lagoon_ravel_a[]={{".2", u3wf_la_ravel}, {}}; static u3j_harm _139_hex__lagoon_min_a[]={{".2", u3wf_la_min}, {}}; static u3j_harm _139_hex__lagoon_max_a[]={{".2", u3wf_la_max}, {}}; static u3j_harm _139_hex__lagoon_linspace_a[]={{".2", u3wf_la_linspace}, {}}; -// static u3j_harm _139_hex__lagoon_range_a[]={{".2", u3wf_la_range}, {}}; -// static u3j_harm _139_hex__lagoon_submatrix_a[]={{".2", u3wf_la_submatrix}, {}}; +static u3j_harm _139_hex__lagoon_range_a[]={{".2", u3wf_la_range}, {}}; static u3j_harm _139_hex__lagoon_abs_a[]={{".2", u3wf_la_abs}, {}}; static u3j_harm _139_hex__lagoon_gth_a[]={{".2", u3wf_la_gth}, {}}; static u3j_harm _139_hex__lagoon_gte_a[]={{".2", u3wf_la_gte}, {}}; @@ -2194,16 +2192,14 @@ static u3j_core _139_hex__la_core_d[] = { "mod-scal", 7, _139_hex__lagoon_mods_a, 0, no_hashes }, { "dot", 7, _139_hex__lagoon_dot_a, 0, no_hashes }, { "transpose",7, _139_hex__lagoon_trans_a, 0, no_hashes }, - // { "stack", 7, _139_hex__lagoon_stack_a, 0, no_hashes }, - // { "cumsum", 7, _139_hex__lagoon_cumsum_a, 0, no_hashes }, + { "cumsum", 7, _139_hex__lagoon_cumsum_a, 0, 
no_hashes }, { "argmin", 7, _139_hex__lagoon_argmin_a, 0, no_hashes }, { "argmax", 7, _139_hex__lagoon_argmax_a, 0, no_hashes }, - // { "ravel", 7, _139_hex__lagoon_ravel_a, 0, no_hashes }, + { "ravel", 7, _139_hex__lagoon_ravel_a, 0, no_hashes }, { "min", 7, _139_hex__lagoon_min_a, 0, no_hashes }, { "max", 7, _139_hex__lagoon_max_a, 0, no_hashes }, { "linspace", 7, _139_hex__lagoon_linspace_a, 0, no_hashes }, - // { "range", 7, _139_hex__lagoon_range_a, 0, no_hashes }, - // { "submatrix",7, _139_hex__lagoon_submatrix_a, 0, no_hashes }, + { "range", 7, _139_hex__lagoon_range_a, 0, no_hashes }, { "abs", 7, _139_hex__lagoon_abs_a, 0, no_hashes }, { "gth", 7, _139_hex__lagoon_gth_a, 0, no_hashes }, { "gte", 7, _139_hex__lagoon_gte_a, 0, no_hashes }, diff --git a/pkg/noun/jets/w.h b/pkg/noun/jets/w.h index 666b611bc5..e7976ba3b7 100644 --- a/pkg/noun/jets/w.h +++ b/pkg/noun/jets/w.h @@ -344,7 +344,6 @@ u3_noun u3wf_la_dot(u3_noun); u3_noun u3wf_la_diag(u3_noun); u3_noun u3wf_la_transpose(u3_noun); - u3_noun u3wf_la_cumsum(u3_noun); u3_noun u3wf_la_argmin(u3_noun); u3_noun u3wf_la_argmax(u3_noun); @@ -352,7 +351,7 @@ u3_noun u3wf_la_min(u3_noun); u3_noun u3wf_la_max(u3_noun); u3_noun u3wf_la_linspace(u3_noun); - + u3_noun u3wf_la_range(u3_noun); u3_noun u3wf_la_abs(u3_noun); u3_noun u3wf_la_gth(u3_noun); u3_noun u3wf_la_gte(u3_noun); From 338154c13e913c8248ba3d810af08db7429c89f3 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Wed, 17 Apr 2024 17:02:46 -0500 Subject: [PATCH 22/97] Post all jets. --- pkg/noun/jets/f/lagoon.c | 159 +++++++++++++++++++-------------------- 1 file changed, 79 insertions(+), 80 deletions(-) diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c index 8dc6144d2e..9e5215e387 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/f/lagoon.c @@ -128,7 +128,7 @@ u3r_bytes(0, syz_x+1, y_bytes, y_data); // Switch on the block size. 
- switch (bloq) { + switch (u3x_atom(bloq)) { case 4: haxpy(len_x, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); break; @@ -186,7 +186,7 @@ u3r_bytes(0, syz_x+1, y_bytes, y_data); // Switch on the block size. - switch (bloq) { + switch (u3x_atom(bloq)) { case 4: haxpy(len_x, (float16_t){SB_REAL16_NEGONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); break; @@ -245,7 +245,7 @@ u3r_bytes(0, syz_x+1, y_bytes, y_data); // Switch on the block size. - switch (bloq) { + switch (u3x_atom(bloq)) { case 4: for (c3_d i = 0; i < len_x; i++) { ((float16_t*)y_bytes)[i] = f16_mul(((float16_t*)x_bytes)[i], ((float16_t*)y_bytes)[i]); @@ -311,7 +311,7 @@ u3r_bytes(0, syz_x+1, y_bytes, y_data); // Switch on the block size. - switch (bloq) { + switch (u3x_atom(bloq)) { case 4: for (c3_d i = 0; i < len_x; i++) { ((float16_t*)y_bytes)[i] = f16_div(((float16_t*)x_bytes)[i], ((float16_t*)y_bytes)[i]); @@ -377,7 +377,7 @@ u3r_bytes(0, syz_x+1, y_bytes, y_data); // Switch on the block size. - switch (bloq) { + switch (u3x_atom(bloq)) { case 4: for (c3_d i = 0; i < len_x; i++) { float16_t x_val16 = ((float16_t*)x_bytes)[i]; @@ -466,6 +466,7 @@ if (bloq < 4 || bloq > 7) { return u3_none; } + fprintf(stderr, "function rounding mode: %lx\r\n", softfloat_roundingMode); // Unpack the data as a byte array. We assume total length < 2**64. // len_x is length in base units @@ -481,7 +482,7 @@ u3_noun r_data; // Switch on the block size. 
- switch (bloq) { + switch (u3x_atom(bloq)) { case 4: ; float16_t sum16[2]; for (c3_d i = 0; i < len_x; i++) { @@ -512,7 +513,10 @@ case 7: ; float128_t sum128[2]; for (c3_d i = 0; i < len_x; i++) { + fprintf(stderr, " sum128[%d] = %lx %lx\r\n", i, sum128[i].v[0], sum128[i].v[1]); + fprintf(stderr, "x_bytes[%d] = %lx %lx\r\n", i, ((float128_t*)x_bytes)[i].v[0], ((float128_t*)x_bytes)[i].v[1]); f128M_add(&(sum128[0]), &(((float128_t*)x_bytes)[i]), &(sum128[0])); + fprintf(stderr, " equals[%d] = %lx %lx\r\n", i, sum128[i].v[0], sum128[i].v[1]); } sum128[1] = (float128_t){0x1, 0x0}; r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)sum128); @@ -551,7 +555,7 @@ c3_w min_idx = 0; // Switch on the block size. - switch (bloq) { + switch (u3x_atom(bloq)) { case 4: ; float16_t min_val16 = ((float16_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { @@ -624,7 +628,7 @@ c3_w max_idx = 0; // Switch on the block size. - switch (bloq) { + switch (u3x_atom(bloq)) { case 4: ; float16_t max_val16 = ((float16_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { @@ -696,10 +700,10 @@ u3r_bytes(0, syz_x, x_bytes, x_data); // r_data is the result noun of [data] - u3_noun r_data; + u3_noun r_data = u3_nul; // Switch on the block size. - switch (bloq) { + switch (u3x_atom(bloq)) { case 4: for (c3_d i = 0; i < len_x; i++) { float16_t x_val16 = ((float16_t*)x_bytes)[i]; @@ -761,7 +765,7 @@ u3_noun r_data; // Switch on the block size. - switch (bloq) { + switch (u3x_atom(bloq)) { case 4: ; float16_t min_val16 = ((float16_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { @@ -839,7 +843,7 @@ u3_noun r_data; // Switch on the block size. - switch (bloq) { + switch (u3x_atom(bloq)) { case 4: ; float16_t max_val16 = ((float16_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { @@ -915,7 +919,7 @@ u3r_bytes(0, syz_x+1, x_bytes, x_data); // Switch on the block size. 
- switch (bloq) { + switch (u3x_atom(bloq)) { case 4: for (c3_d i = 0; i < len_x; i++) { ((float16_t*)x_bytes)[i] = f16_abs(((float16_t*)x_bytes)[i]); @@ -979,7 +983,7 @@ u3r_bytes(0, syz_x+1, y_bytes, y_data); // Switch on the block size. - switch (bloq) { + switch (u3x_atom(bloq)) { case 4: for (c3_d i = 0; i < len_x; i++) { float16_t x_val16 = ((float16_t*)x_bytes)[i]; @@ -1052,7 +1056,7 @@ u3r_bytes(0, syz_x+1, y_bytes, y_data); // Switch on the block size. - switch (bloq) { + switch (u3x_atom(bloq)) { case 4: for (c3_d i = 0; i < len_x; i++) { float16_t x_val16 = ((float16_t*)x_bytes)[i]; @@ -1125,7 +1129,7 @@ u3r_bytes(0, syz_x+1, y_bytes, y_data); // Switch on the block size. - switch (bloq) { + switch (u3x_atom(bloq)) { case 4: for (c3_d i = 0; i < len_x; i++) { float16_t x_val16 = ((float16_t*)x_bytes)[i]; @@ -1198,7 +1202,7 @@ u3r_bytes(0, syz_x+1, y_bytes, y_data); // Switch on the block size. - switch (bloq) { + switch (u3x_atom(bloq)) { case 4: for (c3_d i = 0; i < len_x; i++) { float16_t x_val16 = ((float16_t*)x_bytes)[i]; @@ -1275,7 +1279,7 @@ float128_t n128; // Switch on the block size. We assume that n fits in the target block size; Hoon typecheck should prevent. - switch (bloq) { + switch (u3x_atom(bloq)) { case 4: u3r_bytes(0, 2, (c3_y*)&(n16.v), n); // set y to [n] @@ -1357,7 +1361,7 @@ float128_t n128; // Switch on the block size. We assume that n fits in the target block size; Hoon typecheck should prevent. - switch (bloq) { + switch (u3x_atom(bloq)) { case 4: u3r_bytes(0, 2, (c3_y*)&(n16.v), n); // set y to [n] @@ -1438,7 +1442,7 @@ float128_t n128; // Switch on the block size. - switch (bloq) { + switch (u3x_atom(bloq)) { case 4: u3r_bytes(0, 2, (c3_y*)&(n16.v), n); hscal(len_x, n16, (float16_t*)x_bytes, 1); @@ -1501,7 +1505,7 @@ float128_t in128; // Switch on the block size. 
- switch (bloq) { + switch (u3x_atom(bloq)) { case 4: // XX note that in16 is doing double duty here u3r_bytes(0, 2, (c3_y*)&(in16.v), n); @@ -1572,7 +1576,7 @@ float128_t n128, in128; // Switch on the block size. - switch (bloq) { + switch (u3x_atom(bloq)) { case 4: u3r_bytes(0, 2, (c3_y*)&(n16.v), n); in16 = f16_div((float16_t){SB_REAL16_ONE}, n16); @@ -1688,7 +1692,7 @@ u3_noun r_data; // Switch on the block size. - switch (bloq) { + switch (u3x_atom(bloq)) { case 4: ; float16_t r16[2]; r16[0] = hdot(len_x, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); @@ -1836,7 +1840,7 @@ u3_noun r_data; - switch (bloq) { + switch (u3x_atom(bloq)) { case 4: ; float16_t a16, b16; u3r_bytes(0, 2, (c3_y*)&(a16.v), a); @@ -1931,7 +1935,7 @@ u3_noun r_data; - switch (bloq) { + switch (u3x_atom(bloq)) { case 4: ; float16_t a16, b16, interval16; u3r_bytes(0, 2, (c3_y*)&(a16.v), a); @@ -2042,75 +2046,66 @@ } c3_d N = Na; - c3_y* x_bytes = (c3_y*)u3a_malloc((M*N)*sizeof(c3_y)); - u3r_bytes(0, M*N, x_bytes, x_data); - c3_y* y_bytes = (c3_y*)u3a_malloc((N*P)*sizeof(c3_y)); - u3r_bytes(0, N*P, y_bytes, y_data); - c3_y* c_bytes = (c3_y*)u3a_malloc((M*P)*sizeof(c3_y)); - - u3_noun r_data; - - // Switch on the block size. - switch (bloq) { - case 4: - hgemm('N', 'N', M, N, P, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, N, (float16_t*)y_bytes, N, (float16_t){SB_REAL16_ZERO}, (float16_t*)c_bytes, P); + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(x_shape); // M*N - // Unpack the result back into a noun. - r_data = u3i_bytes(M*P, c_bytes); + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); // M*N - // Clean up. 
- u3a_free(x_bytes); - u3a_free(y_bytes); - u3a_free(c_bytes); + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*pow(2,bloq-3)*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); - return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); + // len_x is length in base units + c3_d len_y = _get_length(y_shape); // N*P - case 5: - sgemm('N', 'N', M, N, P, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, N, (float32_t*)y_bytes, N, (float32_t){SB_REAL32_ZERO}, (float32_t*)c_bytes, P); + // syz_x is length in bytes + c3_d syz_y = len_x * pow(2, bloq-3); // N*P - // Unpack the result back into a noun. - r_data = u3i_bytes(M*P, c_bytes); + // y_bytes is the data array (w/o leading 0x1) + c3_y* y_bytes = (c3_y*)u3a_malloc(syz_y*pow(2,bloq-3)*sizeof(c3_y)); + u3r_bytes(0, syz_y, y_bytes, y_data); - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); - u3a_free(c_bytes); + // syz_r is length in bytes + c3_d syz_r = (M*P) * pow(2, bloq-3); // M*P + + // len_r is length in base units + c3_d len_r = M*P; // M*P - return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); + // r_bytes is the result array + c3_y* r_bytes = (c3_y*)u3a_malloc((syz_r*pow(2,bloq-3)+1)*sizeof(c3_y)); + r_bytes[syz_r] = 1; // pin head - case 6: - dgemm('N', 'N', M, N, P, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, N, (float64_t*)y_bytes, N, (float64_t){SB_REAL64_ZERO}, (float64_t*)c_bytes, P); + u3_noun r_data; - // Unpack the result back into a noun. - r_data = u3i_bytes(M*P, c_bytes); + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + hgemm('N', 'N', M, N, P, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, N, (float16_t*)y_bytes, N, (float16_t){SB_REAL16_ZERO}, (float16_t*)r_bytes, P); + break; - // Clean up. 
- u3a_free(x_bytes); - u3a_free(y_bytes); - u3a_free(c_bytes); + case 5: + sgemm('N', 'N', M, N, P, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, N, (float32_t*)y_bytes, N, (float32_t){SB_REAL32_ZERO}, (float32_t*)r_bytes, P); + break; - return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); + case 6: + dgemm('N', 'N', M, N, P, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, N, (float64_t*)y_bytes, N, (float64_t){SB_REAL64_ZERO}, (float64_t*)r_bytes, P); + break; case 7: - qgemm('N', 'N', M, N, P, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, N, (float128_t*)y_bytes, N, (float128_t){SB_REAL128L_ZERO,SB_REAL128U_ZERO}, (float128_t*)c_bytes, P); - - // Unpack the result back into a noun. - r_data = u3i_bytes(M*P, c_bytes); + qgemm('N', 'N', M, N, P, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, N, (float128_t*)y_bytes, N, (float128_t){SB_REAL128L_ZERO,SB_REAL128U_ZERO}, (float128_t*)r_bytes, P); + break; + } - // Clean up. - u3a_free(x_bytes); - u3a_free(y_bytes); - u3a_free(c_bytes); + // Unpack the result back into a noun. 
+ r_data = u3i_bytes(syz_r*pow(2,bloq-3)+1, r_bytes); - return u3nc(u3nq(u3nl(M, P, u3_none), bloq, c3__real, u3_nul), r_data); + u3a_free(x_bytes); + u3a_free(y_bytes); + u3a_free(r_bytes); - default: - u3a_free(x_bytes); - u3a_free(y_bytes); - u3a_free(c_bytes); - - return u3_none; - } + return u3nc(u3nq(u3nt(u3k(M), u3k(P), u3_nul), u3k(bloq), c3__real, u3_nul), u3k(r_data)); } u3_noun @@ -2419,6 +2414,8 @@ case c3__real: _set_rounding(rnd); u3_noun r_data = u3qf_la_cumsum_real(x_data, x_shape, x_bloq); + fprintf(stderr, "desired rounding mode: %lx\r\n", rnd); + fprintf(stderr, "apparent rounding mode: %lx\r\n", softfloat_roundingMode); return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -3349,8 +3346,10 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - return u3qf_la_mmul_real(x_data, y_data, x_shape, y_shape, x_bloq); - break; + u3_noun r_data; + r_data = u3qf_la_mmul_real(x_data, y_data, x_shape, y_shape, x_bloq); + // result is already [meta data] + return u3k(r_data); default: return u3_none; From 6f2fc23c6d97f4b206cc239e7bb156f906f30ce5 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Wed, 17 Apr 2024 19:40:06 -0500 Subject: [PATCH 23/97] Fix cumsum. 
--- pkg/noun/jets/f/lagoon.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c index 9e5215e387..0d16f70eb5 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/f/lagoon.c @@ -485,6 +485,7 @@ switch (u3x_atom(bloq)) { case 4: ; float16_t sum16[2]; + sum16[0] = (float16_t){SB_REAL16_ZERO}; for (c3_d i = 0; i < len_x; i++) { sum16[0] = f16_add(sum16[0], ((float16_t*)x_bytes)[i]); } @@ -494,6 +495,7 @@ case 5: ; float32_t sum32[2]; + sum32[0] = (float32_t){SB_REAL32_ZERO}; for (c3_d i = 0; i < len_x; i++) { sum32[0] = f32_add(sum32[0], ((float32_t*)x_bytes)[i]); } @@ -503,6 +505,7 @@ case 6: ; float64_t sum64[2]; + sum64[0] = (float64_t){SB_REAL64_ZERO}; for (c3_d i = 0; i < len_x; i++) { sum64[0] = f64_add(sum64[0], ((float64_t*)x_bytes)[i]); } @@ -512,11 +515,9 @@ case 7: ; float128_t sum128[2]; + sum128[0] = (float128_t){SB_REAL128L_ZERO, SB_REAL128U_ZERO}; for (c3_d i = 0; i < len_x; i++) { - fprintf(stderr, " sum128[%d] = %lx %lx\r\n", i, sum128[i].v[0], sum128[i].v[1]); - fprintf(stderr, "x_bytes[%d] = %lx %lx\r\n", i, ((float128_t*)x_bytes)[i].v[0], ((float128_t*)x_bytes)[i].v[1]); f128M_add(&(sum128[0]), &(((float128_t*)x_bytes)[i]), &(sum128[0])); - fprintf(stderr, " equals[%d] = %lx %lx\r\n", i, sum128[i].v[0], sum128[i].v[1]); } sum128[1] = (float128_t){0x1, 0x0}; r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)sum128); @@ -2414,8 +2415,6 @@ case c3__real: _set_rounding(rnd); u3_noun r_data = u3qf_la_cumsum_real(x_data, x_shape, x_bloq); - fprintf(stderr, "desired rounding mode: %lx\r\n", rnd); - fprintf(stderr, "apparent rounding mode: %lx\r\n", softfloat_roundingMode); return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2640,7 +2639,6 @@ switch (x_kind) { case c3__real: ; u3_noun r_data = u3qf_la_abs_real(x_data, x_shape, x_bloq); - fprintf(stderr, "abs: %lx\n", x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), 
u3k(x_kind), u3k(x_fxp)), r_data); default: From 90014967eb162dafdf62ebc778339bf8d44929b8 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Thu, 18 Apr 2024 14:26:51 -0500 Subject: [PATCH 24/97] Post working version of all jets for reals to date. --- pkg/noun/jets/f/lagoon.c | 106 +++++++++++++++++++-------------------- pkg/noun/jets/tree.c | 20 ++++---- 2 files changed, 62 insertions(+), 64 deletions(-) diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/f/lagoon.c index 0d16f70eb5..8fd95be2ab 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/f/lagoon.c @@ -466,7 +466,6 @@ if (bloq < 4 || bloq > 7) { return u3_none; } - fprintf(stderr, "function rounding mode: %lx\r\n", softfloat_roundingMode); // Unpack the data as a byte array. We assume total length < 2**64. // len_x is length in base units @@ -1319,7 +1318,7 @@ } // r_data is the result noun of [data] - y_bytes[syz_x] = 1; // pin head + y_bytes[syz_x] = 0x1; // pin head u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); // Clean up and return. @@ -1401,7 +1400,7 @@ } // r_data is the result noun of [data] - x_bytes[syz_x] = 1; // pin head + x_bytes[syz_x] = 0x1; // pin head u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), x_bytes); // Clean up and return. 
@@ -1435,7 +1434,7 @@ // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy) c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); u3r_bytes(0, syz_x, x_bytes, x_data); - x_bytes[syz_x] = 1; // pin head + x_bytes[syz_x] = 0x1; // pin head float16_t n16; float32_t n32; @@ -1498,7 +1497,7 @@ // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy) c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); u3r_bytes(0, syz_x, x_bytes, x_data); - x_bytes[syz_x] = 1; // pin head + x_bytes[syz_x] = 0x1; // pin head float16_t in16; float32_t in32; @@ -1769,7 +1768,7 @@ y_bytes[i*wyd+k] = x_bytes[(i*dims[0]+i)*wyd+k]; } } - y_bytes[syz_y] = 1; // pin head + y_bytes[syz_y] = 0x1; // pin head // Unpack the result back into a noun. r_data = u3i_bytes((syz_y+1)*sizeof(c3_y), y_bytes); @@ -1814,7 +1813,7 @@ } } } - y_bytes[syz_x] = 1; // pin head + y_bytes[syz_x] = 0x1; // pin head // Unpack the result back into a noun. r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); @@ -1854,7 +1853,7 @@ } ((float16_t*)x_bytes16)[n] = a16; ((float16_t*)x_bytes16)[0] = b16; - x_bytes16[(n+1)*2] = 1; // pin head + x_bytes16[(n+1)*2] = 0x1; // pin head r_data = u3i_bytes(((n+1)*2+1)*sizeof(c3_y), x_bytes16); u3a_free(x_bytes16); break; @@ -1871,7 +1870,7 @@ } ((float32_t*)x_bytes32)[n] = a32; ((float32_t*)x_bytes32)[0] = b32; - x_bytes32[(n+1)*4] = 1; // pin head + x_bytes32[(n+1)*4] = 0x1; // pin head r_data = u3i_bytes(((n+1)*4+1)*sizeof(c3_y), x_bytes32); u3a_free(x_bytes32); break; @@ -1888,7 +1887,7 @@ } ((float64_t*)x_bytes64)[n] = a64; ((float64_t*)x_bytes64)[0] = b64; - x_bytes64[(n+1)*8] = 1; // pin head + x_bytes64[(n+1)*8] = 0x1; // pin head r_data = u3i_bytes(((n+1)*8+1)*sizeof(c3_y), x_bytes64); u3a_free(x_bytes64); break; @@ -1912,7 +1911,7 @@ } ((float128_t*)x_bytes128)[n] = a128; ((float128_t*)x_bytes128)[0] = b128; - x_bytes128[(n+1)*16] = 1; // pin head + x_bytes128[(n+1)*16] = 0x1; // pin head r_data = u3i_bytes(((n+1)*16+1)*sizeof(c3_y), x_bytes128); 
u3a_free(x_bytes128); break; @@ -1949,7 +1948,7 @@ } ((float16_t*)x_bytes16)[n16] = a16; // ((float16_t*)x_bytes16)[0] = b16; - x_bytes16[(n16+1)*2] = 1; // pin head + x_bytes16[(n16+1)*2] = 0x1; // pin head r_data = u3i_bytes(((n16+1)*2+1)*sizeof(c3_y), x_bytes16); u3a_free(x_bytes16); break; @@ -1966,7 +1965,7 @@ } ((float32_t*)x_bytes32)[n32] = a32; // ((float32_t*)x_bytes32)[0] = b32; - x_bytes32[(n32+1)*4] = 1; // pin head + x_bytes32[(n32+1)*4] = 0x1; // pin head r_data = u3i_bytes(((n32+1)*4+1)*sizeof(c3_y), x_bytes32); u3a_free(x_bytes32); break; @@ -1983,7 +1982,7 @@ } ((float64_t*)x_bytes64)[n64] = a64; // ((float64_t*)x_bytes64)[0] = b64; - x_bytes64[(n64+1)*8] = 1; // pin head + x_bytes64[(n64+1)*8] = 0x1; // pin head r_data = u3i_bytes(((n64+1)*8+1)*sizeof(c3_y), x_bytes64); u3a_free(x_bytes64); break; @@ -2003,7 +2002,7 @@ } ((float128_t*)x_bytes128)[n128] = a128; // ((float128_t*)x_bytes128)[0] = b128; - x_bytes128[(n128+1)*16] = 1; // pin head + x_bytes128[(n128+1)*16] = 0x1; // pin head r_data = u3i_bytes(((n128+1)*16+1)*sizeof(c3_y), x_bytes128); u3a_free(x_bytes128); break; @@ -2021,7 +2020,7 @@ { u3_noun d_data = u3qf_la_diag(x_data, shape, bloq); c3_d len_x0 = _get_dims(shape)[0]; - u3_noun r_data = u3qf_la_dot_real(d_data, d_data, u3nt(len_x0, 0x1, u3_nul), bloq); + u3_noun r_data = u3qf_la_dot_real(d_data, d_data, u3nt(len_x0, 0x1, u3_nul), u3k(bloq)); return r_data; } @@ -2035,10 +2034,10 @@ u3_noun bloq) { // Unpack the data as a byte array. We assume total length < 2**64. 
- c3_d M = u3h(x_shape); - c3_d Na = u3h(u3t(x_shape)); - c3_d Nb = u3h(y_shape); - c3_d P = u3h(u3t(y_shape)); + c3_d M = u3x_atom(u3h(x_shape)); + c3_d Na= u3x_atom(u3h(u3t(x_shape))); + c3_d Nb= u3x_atom(u3h(y_shape)); + c3_d P = u3x_atom(u3h(u3t(y_shape))); if ((u3_nul != u3t(u3t(x_shape))) || (u3_nul != u3t(u3t(y_shape))) || @@ -2055,58 +2054,62 @@ c3_d syz_x = len_x * pow(2, bloq-3); // M*N // x_bytes is the data array (w/o leading 0x1) - c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*pow(2,bloq-3)*sizeof(c3_y)); + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); u3r_bytes(0, syz_x, x_bytes, x_data); // len_x is length in base units c3_d len_y = _get_length(y_shape); // N*P // syz_x is length in bytes - c3_d syz_y = len_x * pow(2, bloq-3); // N*P + c3_d syz_y = len_y * pow(2, bloq-3); // N*P // y_bytes is the data array (w/o leading 0x1) - c3_y* y_bytes = (c3_y*)u3a_malloc(syz_y*pow(2,bloq-3)*sizeof(c3_y)); + c3_y* y_bytes = (c3_y*)u3a_malloc(syz_y*sizeof(c3_y)); u3r_bytes(0, syz_y, y_bytes, y_data); - - // syz_r is length in bytes - c3_d syz_r = (M*P) * pow(2, bloq-3); // M*P // len_r is length in base units c3_d len_r = M*P; // M*P - // r_bytes is the result array - c3_y* r_bytes = (c3_y*)u3a_malloc((syz_r*pow(2,bloq-3)+1)*sizeof(c3_y)); - r_bytes[syz_r] = 1; // pin head + // syz_r is length in bytes + c3_d syz_r = len_r * pow(2, bloq-3); // M*P - u3_noun r_data; + // r_bytes is the result array + c3_y* r_bytes = (c3_y*)u3a_malloc((syz_r+1)*sizeof(c3_y)); + r_bytes[syz_r] = 0x1; // pin head + // initialize with 0x0s + for (c3_d i = 0; i < syz_r; i++) { + r_bytes[i] = 0x0; + } // Switch on the block size. 
switch (u3x_atom(bloq)) { case 4: - hgemm('N', 'N', M, N, P, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, N, (float16_t*)y_bytes, N, (float16_t){SB_REAL16_ZERO}, (float16_t*)r_bytes, P); + hgemm('N', 'N', M, N, P, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, N, (float16_t*)y_bytes, P, (float16_t){SB_REAL16_ZERO}, (float16_t*)r_bytes, P); break; case 5: - sgemm('N', 'N', M, N, P, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, N, (float32_t*)y_bytes, N, (float32_t){SB_REAL32_ZERO}, (float32_t*)r_bytes, P); + sgemm('N', 'N', M, N, P, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, N, (float32_t*)y_bytes, P, (float32_t){SB_REAL32_ZERO}, (float32_t*)r_bytes, P); break; case 6: - dgemm('N', 'N', M, N, P, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, N, (float64_t*)y_bytes, N, (float64_t){SB_REAL64_ZERO}, (float64_t*)r_bytes, P); + dgemm('N', 'N', M, N, P, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, N, (float64_t*)y_bytes, P, (float64_t){SB_REAL64_ZERO}, (float64_t*)r_bytes, P); break; case 7: - qgemm('N', 'N', M, N, P, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, N, (float128_t*)y_bytes, N, (float128_t){SB_REAL128L_ZERO,SB_REAL128U_ZERO}, (float128_t*)r_bytes, P); + qgemm('N', 'N', M, N, P, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, N, (float128_t*)y_bytes, P, (float128_t){SB_REAL128L_ZERO,SB_REAL128U_ZERO}, (float128_t*)r_bytes, P); break; } // Unpack the result back into a noun. 
- r_data = u3i_bytes(syz_r*pow(2,bloq-3)+1, r_bytes); + u3_noun r_data = u3i_bytes(syz_r+1, r_bytes); + u3_noun M_ = u3i_chub(M); + u3_noun P_ = u3i_chub(P); u3a_free(x_bytes); u3a_free(y_bytes); u3a_free(r_bytes); - return u3nc(u3nq(u3nt(u3k(M), u3k(P), u3_nul), u3k(bloq), c3__real, u3_nul), u3k(r_data)); + return u3nc(u3nq(u3nt(M_, P_, u3_nul), u3k(bloq), c3__real, u3_nul), r_data); } u3_noun @@ -3321,22 +3324,18 @@ u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, rnd; - if ( c3n == u3r_mean(x_meta, - 2, &x_shape, - 6, &x_bloq, - 14, &x_kind, - 15, &x_fxp, - 0) || - c3n == u3r_mean(y_meta, - 2, &y_shape, - 6, &y_bloq, - 14, &y_kind, - 15, &y_fxp, - 0) || - c3n == u3r_sing(x_bloq, y_bloq) || - c3n == u3r_sing(x_kind, y_kind) || + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_fxp = u3t(u3t(u3t(x_meta))); // 15 + y_shape = u3h(y_meta); // 2 + y_bloq = u3h(u3t(y_meta)); // 6 + y_kind = u3h(u3t(u3t(y_meta))); // 14 + y_fxp = u3t(u3t(u3t(y_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3r_sing(x_bloq, y_bloq) || + c3n == u3r_sing(x_kind, y_kind) // fxp does not need to match so no check - c3n == u3r_mean(cor, u3x_con_sam, &rnd, 0) ) { return u3m_bail(c3__exit); @@ -3344,10 +3343,9 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data; - r_data = u3qf_la_mmul_real(x_data, y_data, x_shape, y_shape, x_bloq); + u3_noun r_data = u3qf_la_mmul_real(x_data, y_data, x_shape, y_shape, x_bloq); // result is already [meta data] - return u3k(r_data); + return r_data; default: return u3_none; diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index dc27123a4d..35573a42e6 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2148,8 +2148,8 @@ static u3j_core _139_hex_json_d[] = }; /* linear algebra jets - XX move to outer _sep_ core for /lib? eventually -static u3j_core _139_sep_d[] = + XX move to outer _hep_ core for /lib? 
eventually +static u3j_core _139_hep_d[] = */ static u3j_harm _139_hex__lagoon_add_a[] = {{".2", u3wf_la_add}, {}}; static u3j_harm _139_hex__lagoon_sub_a[] = {{".2", u3wf_la_sub}, {}}; @@ -2167,15 +2167,15 @@ static u3j_harm _139_hex__lagoon_cumsum_a[]={{".2", u3wf_la_cumsum}, {}}; static u3j_harm _139_hex__lagoon_argmin_a[]={{".2", u3wf_la_argmin}, {}}; static u3j_harm _139_hex__lagoon_argmax_a[]={{".2", u3wf_la_argmax}, {}}; static u3j_harm _139_hex__lagoon_ravel_a[]={{".2", u3wf_la_ravel}, {}}; -static u3j_harm _139_hex__lagoon_min_a[]={{".2", u3wf_la_min}, {}}; -static u3j_harm _139_hex__lagoon_max_a[]={{".2", u3wf_la_max}, {}}; +static u3j_harm _139_hex__lagoon_min_a[] = {{".2", u3wf_la_min}, {}}; +static u3j_harm _139_hex__lagoon_max_a[] = {{".2", u3wf_la_max}, {}}; static u3j_harm _139_hex__lagoon_linspace_a[]={{".2", u3wf_la_linspace}, {}}; -static u3j_harm _139_hex__lagoon_range_a[]={{".2", u3wf_la_range}, {}}; -static u3j_harm _139_hex__lagoon_abs_a[]={{".2", u3wf_la_abs}, {}}; -static u3j_harm _139_hex__lagoon_gth_a[]={{".2", u3wf_la_gth}, {}}; -static u3j_harm _139_hex__lagoon_gte_a[]={{".2", u3wf_la_gte}, {}}; -static u3j_harm _139_hex__lagoon_lth_a[]={{".2", u3wf_la_lth}, {}}; -static u3j_harm _139_hex__lagoon_lte_a[]={{".2", u3wf_la_lte}, {}}; +static u3j_harm _139_hex__lagoon_range_a[]= {{".2", u3wf_la_range}, {}}; +static u3j_harm _139_hex__lagoon_abs_a[] = {{".2", u3wf_la_abs}, {}}; +static u3j_harm _139_hex__lagoon_gth_a[] = {{".2", u3wf_la_gth}, {}}; +static u3j_harm _139_hex__lagoon_gte_a[] = {{".2", u3wf_la_gte}, {}}; +static u3j_harm _139_hex__lagoon_lth_a[] = {{".2", u3wf_la_lth}, {}}; +static u3j_harm _139_hex__lagoon_lte_a[] = {{".2", u3wf_la_lte}, {}}; static u3j_harm _139_hex__lagoon_diag_a[] = {{".2", u3wf_la_diag}, {}}; static u3j_harm _139_hex__lagoon_trace_a[]= {{".2", u3wf_la_trace}, {}}; static u3j_harm _139_hex__lagoon_mmul_a[] = {{".2", u3wf_la_mmul}, {}}; From e43d2101790208d23c82f679a8a590d4a4139028 Mon Sep 17 00:00:00 2001 
From: Sigilante Date: Wed, 24 Apr 2024 10:51:09 -0500 Subject: [PATCH 25/97] Post with things moved and support for ARM. --- bazel/third_party/softblas/softblas.BUILD | 62 ++++++++ pkg/noun/jets/{f => i}/lagoon.c | 172 +++++++++++----------- pkg/noun/jets/q.h | 56 +++---- pkg/noun/jets/tree.c | 132 +++++++++-------- pkg/noun/jets/w.h | 58 ++++---- 5 files changed, 273 insertions(+), 207 deletions(-) rename pkg/noun/jets/{f => i}/lagoon.c (96%) diff --git a/bazel/third_party/softblas/softblas.BUILD b/bazel/third_party/softblas/softblas.BUILD index 3442c5da45..34c80c93e6 100644 --- a/bazel/third_party/softblas/softblas.BUILD +++ b/bazel/third_party/softblas/softblas.BUILD @@ -5,6 +5,68 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") cc_library( name = "softblas", visibility = ["//visibility:public"], + deps = select({ + "@platforms//cpu:aarch64": [":softblas_aarch64"], + "@platforms//cpu:x86_64": [":softblas_x86_64"], + "//conditions:default": [], + }), +) + +cc_library( + name = "softblas_aarch64", + visibility = ["//visibility:public"], + hdrs = ["include/softblas.h"], + includes = ["include"], + srcs = [ + "include/softblas.h", + "src/softblas_state.c", + "src/blas/level1/sasum.c", + "src/blas/level1/dasum.c", + "src/blas/level1/hasum.c", + "src/blas/level1/qasum.c", + "src/blas/level1/saxpy.c", + "src/blas/level1/daxpy.c", + "src/blas/level1/haxpy.c", + "src/blas/level1/qaxpy.c", + "src/blas/level1/scopy.c", + "src/blas/level1/dcopy.c", + "src/blas/level1/hcopy.c", + "src/blas/level1/qcopy.c", + "src/blas/level1/sdot.c", + "src/blas/level1/ddot.c", + "src/blas/level1/hdot.c", + "src/blas/level1/qdot.c", + "src/blas/level1/snrm2.c", + "src/blas/level1/dnrm2.c", + "src/blas/level1/hnrm2.c", + "src/blas/level1/qnrm2.c", + "src/blas/level1/sscal.c", + "src/blas/level1/dscal.c", + "src/blas/level1/hscal.c", + "src/blas/level1/qscal.c", + "src/blas/level1/sswap.c", + "src/blas/level1/dswap.c", + "src/blas/level1/hswap.c", + 
"src/blas/level1/qswap.c", + "src/blas/level1/isamax.c", + "src/blas/level1/idamax.c", + "src/blas/level1/ihamax.c", + "src/blas/level1/iqamax.c", + "src/blas/level2/sgemv.c", + "src/blas/level2/dgemv.c", + "src/blas/level2/hgemv.c", + "src/blas/level2/qgemv.c", + "src/blas/level3/sgemm.c", + "src/blas/level3/dgemm.c", + "src/blas/level3/hgemm.c", + "src/blas/level3/qgemm.c" + ], + deps = ["@softfloat"], +) + +cc_library( + name = "softblas_x86_64", + visibility = ["//visibility:public"], hdrs = ["include/softblas.h"], includes = ["include"], srcs = [ diff --git a/pkg/noun/jets/f/lagoon.c b/pkg/noun/jets/i/lagoon.c similarity index 96% rename from pkg/noun/jets/f/lagoon.c rename to pkg/noun/jets/i/lagoon.c index 8fd95be2ab..be9b8f6791 100644 --- a/pkg/noun/jets/f/lagoon.c +++ b/pkg/noun/jets/i/lagoon.c @@ -101,7 +101,7 @@ /* add - axpy = 1*x+y */ u3_noun - u3qf_la_add_real(u3_noun x_data, + u3qi_la_add_real(u3_noun x_data, u3_noun y_data, u3_noun shape, u3_noun bloq @@ -159,7 +159,7 @@ /* sub - axpy = -1*y+x */ u3_noun - u3qf_la_sub_real(u3_noun x_data, + u3qi_la_sub_real(u3_noun x_data, u3_noun y_data, u3_noun shape, u3_noun bloq @@ -219,7 +219,7 @@ elementwise multiplication */ u3_noun - u3qf_la_mul_real(u3_noun x_data, + u3qi_la_mul_real(u3_noun x_data, u3_noun y_data, u3_noun shape, u3_noun bloq) @@ -285,7 +285,7 @@ elementwise division */ u3_noun - u3qf_la_div_real(u3_noun x_data, + u3qi_la_div_real(u3_noun x_data, u3_noun y_data, u3_noun shape, u3_noun bloq) @@ -351,7 +351,7 @@ remainder after division */ u3_noun - u3qf_la_mod_real(u3_noun x_data, + u3qi_la_mod_real(u3_noun x_data, u3_noun y_data, u3_noun shape, u3_noun bloq) @@ -458,7 +458,7 @@ /* cumsum - x[0] + x[1] + ... 
x[n] */ u3_noun - u3qf_la_cumsum_real(u3_noun x_data, + u3qi_la_cumsum_real(u3_noun x_data, u3_noun shape, u3_noun bloq) { @@ -532,7 +532,7 @@ /* argmin - argmin(x) */ u3_noun - u3qf_la_argmin_real(u3_noun x_data, + u3qi_la_argmin_real(u3_noun x_data, u3_noun shape, u3_noun bloq) { @@ -605,7 +605,7 @@ /* argmax - argmax(x) */ u3_noun - u3qf_la_argmax_real(u3_noun x_data, + u3qi_la_argmax_real(u3_noun x_data, u3_noun shape, u3_noun bloq) { @@ -679,7 +679,7 @@ entire nd-array busted out as a linear list */ u3_noun - u3qf_la_ravel_real(u3_noun x_data, + u3qi_la_ravel_real(u3_noun x_data, u3_noun shape, u3_noun bloq) { @@ -742,7 +742,7 @@ /* min - min(x,y) */ u3_noun - u3qf_la_min_real(u3_noun x_data, + u3qi_la_min_real(u3_noun x_data, u3_noun shape, u3_noun bloq) { @@ -820,7 +820,7 @@ /* max - max(x,y) */ u3_noun - u3qf_la_max_real(u3_noun x_data, + u3qi_la_max_real(u3_noun x_data, u3_noun shape, u3_noun bloq) { @@ -898,7 +898,7 @@ /* abs - |x| */ u3_noun - u3qf_la_abs_real(u3_noun x_data, + u3qi_la_abs_real(u3_noun x_data, u3_noun shape, u3_noun bloq) { @@ -957,7 +957,7 @@ /* gth - x > y */ u3_noun - u3qf_la_gth_real(u3_noun x_data, + u3qi_la_gth_real(u3_noun x_data, u3_noun y_data, u3_noun shape, u3_noun bloq) @@ -1030,7 +1030,7 @@ /* gte - x > y */ u3_noun - u3qf_la_gte_real(u3_noun x_data, + u3qi_la_gte_real(u3_noun x_data, u3_noun y_data, u3_noun shape, u3_noun bloq) @@ -1103,7 +1103,7 @@ /* lth - x > y */ u3_noun - u3qf_la_lth_real(u3_noun x_data, + u3qi_la_lth_real(u3_noun x_data, u3_noun y_data, u3_noun shape, u3_noun bloq) @@ -1176,7 +1176,7 @@ /* lte - x > y */ u3_noun - u3qf_la_lte_real(u3_noun x_data, + u3qi_la_lte_real(u3_noun x_data, u3_noun y_data, u3_noun shape, u3_noun bloq) @@ -1249,7 +1249,7 @@ /* adds - axpy = 1*x+[n] */ u3_noun - u3qf_la_adds_real(u3_noun x_data, + u3qi_la_adds_real(u3_noun x_data, u3_noun n, u3_noun shape, u3_noun bloq) @@ -1331,7 +1331,7 @@ /* subs - axpy = -1*[n]+x */ u3_noun - u3qf_la_subs_real(u3_noun x_data, + 
u3qi_la_subs_real(u3_noun x_data, u3_noun n, u3_noun shape, u3_noun bloq) @@ -1414,7 +1414,7 @@ elementwise multiplication */ u3_noun - u3qf_la_muls_real(u3_noun x_data, + u3qi_la_muls_real(u3_noun x_data, u3_noun n, u3_noun shape, u3_noun bloq) @@ -1477,7 +1477,7 @@ elementwise division */ u3_noun - u3qf_la_divs_real(u3_noun x_data, + u3qi_la_divs_real(u3_noun x_data, u3_noun n, u3_noun shape, u3_noun bloq) @@ -1548,7 +1548,7 @@ remainder after scalar division */ u3_noun - u3qf_la_mods_real(u3_noun x_data, + u3qi_la_mods_real(u3_noun x_data, u3_noun n, u3_noun shape, u3_noun bloq) @@ -1664,7 +1664,7 @@ /* dot - ?dot = x · y */ u3_noun - u3qf_la_dot_real(u3_noun x_data, + u3qi_la_dot_real(u3_noun x_data, u3_noun y_data, u3_noun shape, u3_noun bloq) @@ -1732,7 +1732,7 @@ /* diag - diag(x) */ u3_noun - u3qf_la_diag(u3_noun x_data, + u3qi_la_diag(u3_noun x_data, u3_noun shape, u3_noun bloq) { @@ -1783,7 +1783,7 @@ /* transpose - x' */ u3_noun - u3qf_la_transpose(u3_noun x_data, + u3qi_la_transpose(u3_noun x_data, u3_noun shape, u3_noun bloq) { @@ -1828,7 +1828,7 @@ /* linspace - [a a+(b-a)/n ... b] */ u3_noun - u3qf_la_linspace_real(u3_noun a, + u3qi_la_linspace_real(u3_noun a, u3_noun b, u3_noun n, u3_noun bloq) @@ -1923,7 +1923,7 @@ /* range - [a a+d ... 
b] */ u3_noun - u3qf_la_range_real(u3_noun a, + u3qi_la_range_real(u3_noun a, u3_noun b, u3_noun d, u3_noun bloq) @@ -2014,20 +2014,20 @@ /* trace - tr(x) */ u3_noun - u3qf_la_trace_real(u3_noun x_data, + u3qi_la_trace_real(u3_noun x_data, u3_noun shape, u3_noun bloq) { - u3_noun d_data = u3qf_la_diag(x_data, shape, bloq); + u3_noun d_data = u3qi_la_diag(x_data, shape, bloq); c3_d len_x0 = _get_dims(shape)[0]; - u3_noun r_data = u3qf_la_dot_real(d_data, d_data, u3nt(len_x0, 0x1, u3_nul), u3k(bloq)); + u3_noun r_data = u3qi_la_dot_real(d_data, d_data, u3nt(len_x0, 0x1, u3_nul), u3k(bloq)); return r_data; } /* mmul */ u3_noun - u3qf_la_mmul_real(u3_noun x_data, + u3qi_la_mmul_real(u3_noun x_data, u3_noun y_data, u3_noun x_shape, u3_noun y_shape, @@ -2113,7 +2113,7 @@ } u3_noun - u3wf_la_add(u3_noun cor) + u3wi_la_add(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, @@ -2157,7 +2157,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_add_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = u3qi_la_add_real(x_data, y_data, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2168,7 +2168,7 @@ } u3_noun - u3wf_la_sub(u3_noun cor) + u3wi_la_sub(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, @@ -2212,7 +2212,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_sub_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = u3qi_la_sub_real(x_data, y_data, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2223,7 +2223,7 @@ } u3_noun - u3wf_la_mul(u3_noun cor) + u3wi_la_mul(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, @@ -2267,7 +2267,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_mul_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = 
u3qi_la_mul_real(x_data, y_data, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2278,7 +2278,7 @@ } u3_noun - u3wf_la_div(u3_noun cor) + u3wi_la_div(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, @@ -2322,7 +2322,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_div_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = u3qi_la_div_real(x_data, y_data, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2333,7 +2333,7 @@ } u3_noun - u3wf_la_mod(u3_noun cor) + u3wi_la_mod(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, @@ -2377,7 +2377,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_mod_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = u3qi_la_mod_real(x_data, y_data, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2388,7 +2388,7 @@ } u3_noun - u3wf_la_cumsum(u3_noun cor) + u3wi_la_cumsum(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data; @@ -2417,7 +2417,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_cumsum_real(x_data, x_shape, x_bloq); + u3_noun r_data = u3qi_la_cumsum_real(x_data, x_shape, x_bloq); return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2428,7 +2428,7 @@ } u3_noun - u3wf_la_argmin(u3_noun cor) + u3wi_la_argmin(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data; @@ -2454,7 +2454,7 @@ } else { switch (x_kind) { case c3__real: ; - u3_noun r_data = u3qf_la_argmin_real(x_data, x_shape, x_bloq); + u3_noun r_data = u3qi_la_argmin_real(x_data, x_shape, x_bloq); // bare atom (@ index) return r_data; @@ -2466,7 +2466,7 @@ } u3_noun - u3wf_la_ravel(u3_noun cor) + 
u3wi_la_ravel(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data; @@ -2491,7 +2491,7 @@ } else { switch (x_kind) { case c3__real: ; - u3_noun r_data = u3qf_la_ravel_real(x_data, x_shape, x_bloq); + u3_noun r_data = u3qi_la_ravel_real(x_data, x_shape, x_bloq); // (list @) return r_data; @@ -2503,7 +2503,7 @@ } u3_noun - u3wf_la_argmax(u3_noun cor) + u3wi_la_argmax(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data; @@ -2529,7 +2529,7 @@ } else { switch (x_kind) { case c3__real: ; - u3_noun r_data = u3qf_la_argmax_real(x_data, x_shape, x_bloq); + u3_noun r_data = u3qi_la_argmax_real(x_data, x_shape, x_bloq); // bare atom (@ index) return r_data; @@ -2541,7 +2541,7 @@ } u3_noun - u3wf_la_min(u3_noun cor) + u3wi_la_min(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data; @@ -2567,7 +2567,7 @@ } else { switch (x_kind) { case c3__real: ; - u3_noun r_data = u3qf_la_min_real(x_data, x_shape, x_bloq); + u3_noun r_data = u3qi_la_min_real(x_data, x_shape, x_bloq); return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2578,7 +2578,7 @@ } u3_noun - u3wf_la_max(u3_noun cor) + u3wi_la_max(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data; @@ -2604,7 +2604,7 @@ } else { switch (x_kind) { case c3__real: ; - u3_noun r_data = u3qf_la_max_real(x_data, x_shape, x_bloq); + u3_noun r_data = u3qi_la_max_real(x_data, x_shape, x_bloq); return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2615,7 +2615,7 @@ } u3_noun - u3wf_la_abs(u3_noun cor) + u3wi_la_abs(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data; @@ -2641,7 +2641,7 @@ } else { switch (x_kind) { case c3__real: ; - u3_noun r_data = u3qf_la_abs_real(x_data, x_shape, x_bloq); + u3_noun r_data = u3qi_la_abs_real(x_data, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), 
u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2652,7 +2652,7 @@ } u3_noun - u3wf_la_gth(u3_noun cor) + u3wi_la_gth(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, @@ -2694,7 +2694,7 @@ } else { switch (x_kind) { case c3__real: ; - u3_noun r_data = u3qf_la_gth_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = u3qi_la_gth_real(x_data, y_data, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2705,7 +2705,7 @@ } u3_noun - u3wf_la_gte(u3_noun cor) + u3wi_la_gte(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, @@ -2747,7 +2747,7 @@ } else { switch (x_kind) { case c3__real: ; - u3_noun r_data = u3qf_la_gte_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = u3qi_la_gte_real(x_data, y_data, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2758,7 +2758,7 @@ } u3_noun - u3wf_la_lth(u3_noun cor) + u3wi_la_lth(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, @@ -2800,7 +2800,7 @@ } else { switch (x_kind) { case c3__real: ; - u3_noun r_data = u3qf_la_lth_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = u3qi_la_lth_real(x_data, y_data, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2811,7 +2811,7 @@ } u3_noun - u3wf_la_lte(u3_noun cor) + u3wi_la_lte(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, @@ -2853,7 +2853,7 @@ } else { switch (x_kind) { case c3__real: ; - u3_noun r_data = u3qf_la_lte_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = u3qi_la_lte_real(x_data, y_data, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2864,7 +2864,7 @@ } u3_noun - u3wf_la_adds(u3_noun cor) + u3wi_la_adds(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, 
x_data, n; @@ -2889,7 +2889,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_adds_real(x_data, n, x_shape, x_bloq); + u3_noun r_data = u3qi_la_adds_real(x_data, n, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2899,7 +2899,7 @@ } u3_noun - u3wf_la_subs(u3_noun cor) + u3wi_la_subs(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, n; @@ -2924,7 +2924,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_subs_real(x_data, n, x_shape, x_bloq); + u3_noun r_data = u3qi_la_subs_real(x_data, n, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2934,7 +2934,7 @@ } u3_noun - u3wf_la_muls(u3_noun cor) + u3wi_la_muls(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, n; @@ -2959,7 +2959,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_muls_real(x_data, n, x_shape, x_bloq); + u3_noun r_data = u3qi_la_muls_real(x_data, n, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2969,7 +2969,7 @@ } u3_noun - u3wf_la_divs(u3_noun cor) + u3wi_la_divs(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, n; @@ -2994,7 +2994,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_divs_real(x_data, n, x_shape, x_bloq); + u3_noun r_data = u3qi_la_divs_real(x_data, n, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -3004,7 +3004,7 @@ } u3_noun - u3wf_la_mods(u3_noun cor) + u3wi_la_mods(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, n; @@ -3029,7 +3029,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_mods_real(x_data, n, x_shape, x_bloq); + u3_noun r_data = 
u3qi_la_mods_real(x_data, n, x_shape, x_bloq); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -3039,7 +3039,7 @@ } u3_noun - u3wf_la_dot(u3_noun cor) + u3wi_la_dot(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, @@ -3083,7 +3083,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_dot_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = u3qi_la_dot_real(x_data, y_data, x_shape, x_bloq); c3_d len_x0 = _get_dims(x_shape)[0]; return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); @@ -3095,7 +3095,7 @@ } u3_noun - u3wf_la_transpose(u3_noun cor) + u3wi_la_transpose(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data; @@ -3119,14 +3119,14 @@ { return u3m_bail(c3__exit); } else { - u3_noun r_data = u3qf_la_transpose(x_data, x_shape, x_bloq); + u3_noun r_data = u3qi_la_transpose(x_data, x_shape, x_bloq); return u3nc(u3nq(u3nt(u3k(u3h(x_shape)), u3k(u3h(u3t(x_shape))), u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); } } } u3_noun - u3wf_la_linspace(u3_noun cor) + u3wi_la_linspace(u3_noun cor) { u3_noun x_meta, a, b, n, rnd; @@ -3154,7 +3154,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_linspace_real(a, b, n, x_bloq); + u3_noun r_data = u3qi_la_linspace_real(a, b, n, x_bloq); x_shape = u3nt(u3x_atom(n), 0x1, u3_nul); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); @@ -3166,7 +3166,7 @@ } u3_noun - u3wf_la_range(u3_noun cor) + u3wi_la_range(u3_noun cor) { u3_noun x_meta, a, b, d, rnd; @@ -3194,7 +3194,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_range_real(a, b, d, x_bloq); + u3_noun r_data = u3qi_la_range_real(a, b, d, x_bloq); c3_d a_, b_, d_; c3_ds n_; switch (x_bloq) { @@ -3235,7 +3235,7 @@ } u3_noun - u3wf_la_diag(u3_noun cor) + u3wi_la_diag(u3_noun cor) { // Each argument 
is a ray, [=meta data=@ux] u3_noun x_meta, x_data; @@ -3259,7 +3259,7 @@ { return u3m_bail(c3__exit); } else { - u3_noun r_data = u3qf_la_diag(x_data, x_shape, x_bloq); + u3_noun r_data = u3qi_la_diag(x_data, x_shape, x_bloq); c3_d len_x0 = _get_dims(x_shape)[0]; return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); } @@ -3267,7 +3267,7 @@ } u3_noun - u3wf_la_trace(u3_noun cor) + u3wi_la_trace(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data; @@ -3293,7 +3293,7 @@ } else { switch (x_kind) { case c3__real: ; - u3_noun r_data = u3qf_la_trace_real(x_data, x_shape, x_bloq); + u3_noun r_data = u3qi_la_trace_real(x_data, x_shape, x_bloq); return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -3304,7 +3304,7 @@ } u3_noun - u3wf_la_mmul(u3_noun cor) + u3wi_la_mmul(u3_noun cor) { // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, @@ -3343,7 +3343,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qf_la_mmul_real(x_data, y_data, x_shape, y_shape, x_bloq); + u3_noun r_data = u3qi_la_mmul_real(x_data, y_data, x_shape, y_shape, x_bloq); // result is already [meta data] return r_data; diff --git a/pkg/noun/jets/q.h b/pkg/noun/jets/q.h index b5e5cbc8e1..14a4cac1e3 100644 --- a/pkg/noun/jets/q.h +++ b/pkg/noun/jets/q.h @@ -247,34 +247,34 @@ u3_noun u3qfp_nepo(u3_noun, u3_noun); u3_noun u3qfp_rake(u3_noun); - u3_noun u3qf_la_add_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_sub_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_mul_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_div_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_mod_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_adds_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_subs_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_muls_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun 
u3qf_la_divs_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_mods_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_dot_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_diag(u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_transpose(u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_cumsum_real(u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_argmin_real(u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_argmax_real(u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_ravel_real(u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_min_real(u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_max_real(u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_linspace_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_range_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_abs_real(u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_gth_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_gte_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_lth_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_lte_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_trace_real(u3_noun, u3_noun, u3_noun); - u3_noun u3qf_la_mmul_real(u3_noun, u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_add_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_sub_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_mul_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_div_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_mod_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_adds_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_subs_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_muls_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_divs_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_mods_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_dot_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_diag(u3_noun, u3_noun, u3_noun); + u3_noun 
u3qi_la_transpose(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_cumsum_real(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_argmin_real(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_argmax_real(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_ravel_real(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_min_real(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_max_real(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_linspace_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_range_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_abs_real(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_gth_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_gte_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_lth_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_lte_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_trace_real(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_mmul_real(u3_noun, u3_noun, u3_noun, u3_noun, u3_noun); # define u3qfu_van_fan 28 # define u3qfu_van_rib 58 diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index 35573a42e6..c9118c9a4b 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2147,77 +2147,82 @@ static u3j_core _139_hex_json_d[] = {} }; -/* linear algebra jets - XX move to outer _hep_ core for /lib? 
eventually -static u3j_core _139_hep_d[] = +/* /lib jets in non core */ -static u3j_harm _139_hex__lagoon_add_a[] = {{".2", u3wf_la_add}, {}}; -static u3j_harm _139_hex__lagoon_sub_a[] = {{".2", u3wf_la_sub}, {}}; -static u3j_harm _139_hex__lagoon_mul_a[] = {{".2", u3wf_la_mul}, {}}; -static u3j_harm _139_hex__lagoon_div_a[] = {{".2", u3wf_la_div}, {}}; -static u3j_harm _139_hex__lagoon_mod_a[] = {{".2", u3wf_la_mod}, {}}; -static u3j_harm _139_hex__lagoon_adds_a[] = {{".2", u3wf_la_adds}, {}}; -static u3j_harm _139_hex__lagoon_subs_a[] = {{".2", u3wf_la_subs}, {}}; -static u3j_harm _139_hex__lagoon_muls_a[] = {{".2", u3wf_la_muls}, {}}; -static u3j_harm _139_hex__lagoon_divs_a[] = {{".2", u3wf_la_divs}, {}}; -static u3j_harm _139_hex__lagoon_mods_a[] = {{".2", u3wf_la_mods}, {}}; -static u3j_harm _139_hex__lagoon_dot_a[] = {{".2", u3wf_la_dot}, {}}; -static u3j_harm _139_hex__lagoon_trans_a[] ={{".2", u3wf_la_transpose}, {}}; -static u3j_harm _139_hex__lagoon_cumsum_a[]={{".2", u3wf_la_cumsum}, {}}; -static u3j_harm _139_hex__lagoon_argmin_a[]={{".2", u3wf_la_argmin}, {}}; -static u3j_harm _139_hex__lagoon_argmax_a[]={{".2", u3wf_la_argmax}, {}}; -static u3j_harm _139_hex__lagoon_ravel_a[]={{".2", u3wf_la_ravel}, {}}; -static u3j_harm _139_hex__lagoon_min_a[] = {{".2", u3wf_la_min}, {}}; -static u3j_harm _139_hex__lagoon_max_a[] = {{".2", u3wf_la_max}, {}}; -static u3j_harm _139_hex__lagoon_linspace_a[]={{".2", u3wf_la_linspace}, {}}; -static u3j_harm _139_hex__lagoon_range_a[]= {{".2", u3wf_la_range}, {}}; -static u3j_harm _139_hex__lagoon_abs_a[] = {{".2", u3wf_la_abs}, {}}; -static u3j_harm _139_hex__lagoon_gth_a[] = {{".2", u3wf_la_gth}, {}}; -static u3j_harm _139_hex__lagoon_gte_a[] = {{".2", u3wf_la_gte}, {}}; -static u3j_harm _139_hex__lagoon_lth_a[] = {{".2", u3wf_la_lth}, {}}; -static u3j_harm _139_hex__lagoon_lte_a[] = {{".2", u3wf_la_lte}, {}}; -static u3j_harm _139_hex__lagoon_diag_a[] = {{".2", u3wf_la_diag}, {}}; -static u3j_harm 
_139_hex__lagoon_trace_a[]= {{".2", u3wf_la_trace}, {}}; -static u3j_harm _139_hex__lagoon_mmul_a[] = {{".2", u3wf_la_mmul}, {}}; -static u3j_core _139_hex__la_core_d[] = - { { "add-rays", 7, _139_hex__lagoon_add_a, 0, no_hashes }, - { "sub-rays", 7, _139_hex__lagoon_sub_a, 0, no_hashes }, - { "mul-rays", 7, _139_hex__lagoon_mul_a, 0, no_hashes }, - { "div-rays", 7, _139_hex__lagoon_div_a, 0, no_hashes }, - { "mod-rays", 7, _139_hex__lagoon_mod_a, 0, no_hashes }, - { "add-scal", 7, _139_hex__lagoon_adds_a, 0, no_hashes }, - { "sub-scal", 7, _139_hex__lagoon_subs_a, 0, no_hashes }, - { "mul-scal", 7, _139_hex__lagoon_muls_a, 0, no_hashes }, - { "div-scal", 7, _139_hex__lagoon_divs_a, 0, no_hashes }, - { "mod-scal", 7, _139_hex__lagoon_mods_a, 0, no_hashes }, - { "dot", 7, _139_hex__lagoon_dot_a, 0, no_hashes }, - { "transpose",7, _139_hex__lagoon_trans_a, 0, no_hashes }, - { "cumsum", 7, _139_hex__lagoon_cumsum_a, 0, no_hashes }, - { "argmin", 7, _139_hex__lagoon_argmin_a, 0, no_hashes }, - { "argmax", 7, _139_hex__lagoon_argmax_a, 0, no_hashes }, - { "ravel", 7, _139_hex__lagoon_ravel_a, 0, no_hashes }, - { "min", 7, _139_hex__lagoon_min_a, 0, no_hashes }, - { "max", 7, _139_hex__lagoon_max_a, 0, no_hashes }, - { "linspace", 7, _139_hex__lagoon_linspace_a, 0, no_hashes }, - { "range", 7, _139_hex__lagoon_range_a, 0, no_hashes }, - { "abs", 7, _139_hex__lagoon_abs_a, 0, no_hashes }, - { "gth", 7, _139_hex__lagoon_gth_a, 0, no_hashes }, - { "gte", 7, _139_hex__lagoon_gte_a, 0, no_hashes }, - { "lth", 7, _139_hex__lagoon_lth_a, 0, no_hashes }, - { "lte", 7, _139_hex__lagoon_lte_a, 0, no_hashes }, - { "diag", 7, _139_hex__lagoon_diag_a, 0, no_hashes }, - { "trace", 7, _139_hex__lagoon_trace_a,0, no_hashes }, - { "mmul", 7, _139_hex__lagoon_mmul_a, 0, no_hashes }, +static u3j_harm _139_non__lagoon_add_a[] = {{".2", u3wi_la_add, c3n}, {}}; +static u3j_harm _139_non__lagoon_sub_a[] = {{".2", u3wi_la_sub, c3n}, {}}; +static u3j_harm _139_non__lagoon_mul_a[] = {{".2", 
u3wi_la_mul, c3n}, {}}; +static u3j_harm _139_non__lagoon_div_a[] = {{".2", u3wi_la_div, c3n}, {}}; +static u3j_harm _139_non__lagoon_mod_a[] = {{".2", u3wi_la_mod, c3n}, {}}; +static u3j_harm _139_non__lagoon_adds_a[] = {{".2", u3wi_la_adds, c3n}, {}}; +static u3j_harm _139_non__lagoon_subs_a[] = {{".2", u3wi_la_subs, c3n}, {}}; +static u3j_harm _139_non__lagoon_muls_a[] = {{".2", u3wi_la_muls, c3n}, {}}; +static u3j_harm _139_non__lagoon_divs_a[] = {{".2", u3wi_la_divs, c3n}, {}}; +static u3j_harm _139_non__lagoon_mods_a[] = {{".2", u3wi_la_mods, c3n}, {}}; +static u3j_harm _139_non__lagoon_dot_a[] = {{".2", u3wi_la_dot, c3n}, {}}; +static u3j_harm _139_non__lagoon_trans_a[] ={{".2", u3wi_la_transpose, c3n}, {}}; +static u3j_harm _139_non__lagoon_cumsum_a[]={{".2", u3wi_la_cumsum, c3n}, {}}; +static u3j_harm _139_non__lagoon_argmin_a[]={{".2", u3wi_la_argmin, c3n}, {}}; +static u3j_harm _139_non__lagoon_argmax_a[]={{".2", u3wi_la_argmax, c3n}, {}}; +static u3j_harm _139_non__lagoon_ravel_a[]={{".2", u3wi_la_ravel, c3n}, {}}; +static u3j_harm _139_non__lagoon_min_a[] = {{".2", u3wi_la_min, c3n}, {}}; +static u3j_harm _139_non__lagoon_max_a[] = {{".2", u3wi_la_max, c3n}, {}}; +static u3j_harm _139_non__lagoon_linspace_a[]={{".2", u3wi_la_linspace, c3n}, {}}; +static u3j_harm _139_non__lagoon_range_a[]= {{".2", u3wi_la_range, c3n}, {}}; +static u3j_harm _139_non__lagoon_abs_a[] = {{".2", u3wi_la_abs, c3n}, {}}; +static u3j_harm _139_non__lagoon_gth_a[] = {{".2", u3wi_la_gth, c3n}, {}}; +static u3j_harm _139_non__lagoon_gte_a[] = {{".2", u3wi_la_gte, c3n}, {}}; +static u3j_harm _139_non__lagoon_lth_a[] = {{".2", u3wi_la_lth, c3n}, {}}; +static u3j_harm _139_non__lagoon_lte_a[] = {{".2", u3wi_la_lte, c3n}, {}}; +static u3j_harm _139_non__lagoon_diag_a[] = {{".2", u3wi_la_diag, c3n}, {}}; +static u3j_harm _139_non__lagoon_trace_a[]= {{".2", u3wi_la_trace, c3n}, {}}; +static u3j_harm _139_non__lagoon_mmul_a[] = {{".2", u3wi_la_mmul, c3n}, {}}; +static u3j_core 
_139_non__la_core_d[] = + { { "add-rays", 7, _139_non__lagoon_add_a, 0, no_hashes }, + { "sub-rays", 7, _139_non__lagoon_sub_a, 0, no_hashes }, + { "mul-rays", 7, _139_non__lagoon_mul_a, 0, no_hashes }, + { "div-rays", 7, _139_non__lagoon_div_a, 0, no_hashes }, + { "mod-rays", 7, _139_non__lagoon_mod_a, 0, no_hashes }, + { "add-scal", 7, _139_non__lagoon_adds_a, 0, no_hashes }, + { "sub-scal", 7, _139_non__lagoon_subs_a, 0, no_hashes }, + { "mul-scal", 7, _139_non__lagoon_muls_a, 0, no_hashes }, + { "div-scal", 7, _139_non__lagoon_divs_a, 0, no_hashes }, + { "mod-scal", 7, _139_non__lagoon_mods_a, 0, no_hashes }, + { "dot", 7, _139_non__lagoon_dot_a, 0, no_hashes }, + { "transpose",7, _139_non__lagoon_trans_a, 0, no_hashes }, + { "cumsum", 7, _139_non__lagoon_cumsum_a, 0, no_hashes }, + { "argmin", 7, _139_non__lagoon_argmin_a, 0, no_hashes }, + { "argmax", 7, _139_non__lagoon_argmax_a, 0, no_hashes }, + { "ravel", 7, _139_non__lagoon_ravel_a, 0, no_hashes }, + { "min", 7, _139_non__lagoon_min_a, 0, no_hashes }, + { "max", 7, _139_non__lagoon_max_a, 0, no_hashes }, + { "linspace", 7, _139_non__lagoon_linspace_a, 0, no_hashes }, + { "range", 7, _139_non__lagoon_range_a, 0, no_hashes }, + { "abs", 7, _139_non__lagoon_abs_a, 0, no_hashes }, + { "gth", 7, _139_non__lagoon_gth_a, 0, no_hashes }, + { "gte", 7, _139_non__lagoon_gte_a, 0, no_hashes }, + { "lth", 7, _139_non__lagoon_lth_a, 0, no_hashes }, + { "lte", 7, _139_non__lagoon_lte_a, 0, no_hashes }, + { "diag", 7, _139_non__lagoon_diag_a, 0, no_hashes }, + { "trace", 7, _139_non__lagoon_trace_a,0, no_hashes }, + { "mmul", 7, _139_non__lagoon_mmul_a, 0, no_hashes }, {} }; -static u3j_core _139_hex__lagoon_d[] = - { { "la-core", 7, 0, _139_hex__la_core_d, no_hashes }, +static u3j_core _139_non__lagoon_d[] = + { { "la-core", 7, 0, _139_non__la_core_d, no_hashes }, + {} + }; + +static u3j_core _139_non_d[] = + { { "lagoon", 6, 0, _139_non__lagoon_d, no_hashes }, {} }; static u3j_core _139_hex_d[] = -{ { "lore", 63, 
_140_hex_lore_a, 0, no_hashes }, +{ { "sep", 7, 0, _139_non_d, no_hashes }, + + { "lore", 63, _140_hex_lore_a, 0, no_hashes }, { "leer", 63, _140_hex_leer_a, 0, no_hashes }, { "loss", 63, _140_hex_loss_a, 0, no_hashes }, { "lune", 127, _140_hex_lune_a, 0, no_hashes }, @@ -2235,7 +2240,6 @@ static u3j_core _139_hex_d[] = { "mimes", 31, 0, _140_hex_mimes_d, no_hashes }, { "json", 31, 0, _139_hex_json_d, no_hashes }, - { "lagoon", 31, 0, _139_hex__lagoon_d, no_hashes }, {} }; diff --git a/pkg/noun/jets/w.h b/pkg/noun/jets/w.h index e7976ba3b7..01948a851e 100644 --- a/pkg/noun/jets/w.h +++ b/pkg/noun/jets/w.h @@ -331,34 +331,34 @@ u3_noun u3wfu_repo(u3_noun); u3_noun u3wfu_rest(u3_noun); - u3_noun u3wf_la_add(u3_noun); - u3_noun u3wf_la_sub(u3_noun); - u3_noun u3wf_la_mul(u3_noun); - u3_noun u3wf_la_div(u3_noun); - u3_noun u3wf_la_mod(u3_noun); - u3_noun u3wf_la_adds(u3_noun); - u3_noun u3wf_la_subs(u3_noun); - u3_noun u3wf_la_muls(u3_noun); - u3_noun u3wf_la_divs(u3_noun); - u3_noun u3wf_la_mods(u3_noun); - u3_noun u3wf_la_dot(u3_noun); - u3_noun u3wf_la_diag(u3_noun); - u3_noun u3wf_la_transpose(u3_noun); - u3_noun u3wf_la_cumsum(u3_noun); - u3_noun u3wf_la_argmin(u3_noun); - u3_noun u3wf_la_argmax(u3_noun); - u3_noun u3wf_la_ravel(u3_noun); - u3_noun u3wf_la_min(u3_noun); - u3_noun u3wf_la_max(u3_noun); - u3_noun u3wf_la_linspace(u3_noun); - u3_noun u3wf_la_range(u3_noun); - u3_noun u3wf_la_abs(u3_noun); - u3_noun u3wf_la_gth(u3_noun); - u3_noun u3wf_la_gte(u3_noun); - u3_noun u3wf_la_lth(u3_noun); - u3_noun u3wf_la_lte(u3_noun); - - u3_noun u3wf_la_trace(u3_noun); - u3_noun u3wf_la_mmul(u3_noun); + u3_noun u3wi_la_add(u3_noun); + u3_noun u3wi_la_sub(u3_noun); + u3_noun u3wi_la_mul(u3_noun); + u3_noun u3wi_la_div(u3_noun); + u3_noun u3wi_la_mod(u3_noun); + u3_noun u3wi_la_adds(u3_noun); + u3_noun u3wi_la_subs(u3_noun); + u3_noun u3wi_la_muls(u3_noun); + u3_noun u3wi_la_divs(u3_noun); + u3_noun u3wi_la_mods(u3_noun); + u3_noun u3wi_la_dot(u3_noun); + u3_noun 
u3wi_la_diag(u3_noun); + u3_noun u3wi_la_transpose(u3_noun); + u3_noun u3wi_la_cumsum(u3_noun); + u3_noun u3wi_la_argmin(u3_noun); + u3_noun u3wi_la_argmax(u3_noun); + u3_noun u3wi_la_ravel(u3_noun); + u3_noun u3wi_la_min(u3_noun); + u3_noun u3wi_la_max(u3_noun); + u3_noun u3wi_la_linspace(u3_noun); + u3_noun u3wi_la_range(u3_noun); + u3_noun u3wi_la_abs(u3_noun); + u3_noun u3wi_la_gth(u3_noun); + u3_noun u3wi_la_gte(u3_noun); + u3_noun u3wi_la_lth(u3_noun); + u3_noun u3wi_la_lte(u3_noun); + + u3_noun u3wi_la_trace(u3_noun); + u3_noun u3wi_la_mmul(u3_noun); #endif /* ifndef U3_JETS_W_H */ From fe485ea43e4106840c90c3692ffb30947302e98c Mon Sep 17 00:00:00 2001 From: Sigilante Date: Wed, 24 Apr 2024 10:57:47 -0500 Subject: [PATCH 26/97] Attempt ARM build with same files as x86_64. --- bazel/third_party/softfloat/softfloat.BUILD | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/bazel/third_party/softfloat/softfloat.BUILD b/bazel/third_party/softfloat/softfloat.BUILD index 0d57ba5d7f..09f7b21370 100644 --- a/bazel/third_party/softfloat/softfloat.BUILD +++ b/bazel/third_party/softfloat/softfloat.BUILD @@ -235,6 +235,25 @@ cc_library( "source/extF80M_eq_signaling.c", "source/extF80M_le_quiet.c", "source/extF80M_lt_quiet.c", + "source/f128_to_f16.c", + "source/f128_to_f32.c", + "source/f128_to_extF80.c", + "source/f128_to_f64.c", + "source/f128_roundToInt.c", + "source/f128_add.c", + "source/f128_sub.c", + "source/f128_mul.c", + "source/f128_mulAdd.c", + "source/f128_div.c", + "source/f128_rem.c", + "source/f128_sqrt.c", + "source/f128_eq.c", + "source/f128_le.c", + "source/f128_lt.c", + "source/f128_eq_signaling.c", + "source/f128_le_quiet.c", + "source/f128_lt_quiet.c", + "source/f128_isSignalingNaN.c", "source/f128M_to_ui32.c", "source/f128M_to_ui64.c", "source/f128M_to_i32.c", From bf13e281cdc815bcfe5bfef9538f3a6104435efb Mon Sep 17 00:00:00 2001 From: Sigilante Date: Wed, 24 Apr 2024 11:04:24 -0500 Subject: [PATCH 27/97] Attempt ARM build with 
same files as x86_64. --- bazel/third_party/softfloat/softfloat.BUILD | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bazel/third_party/softfloat/softfloat.BUILD b/bazel/third_party/softfloat/softfloat.BUILD index 09f7b21370..cf01a9dcf2 100644 --- a/bazel/third_party/softfloat/softfloat.BUILD +++ b/bazel/third_party/softfloat/softfloat.BUILD @@ -20,6 +20,9 @@ cc_library( srcs = [ # See `OBJS_PRIMITIVES` in `build/Linux-ARM-VFPv2-GCC/Makefile` in the # `softfloat` repo. + "source/s_eq128.c", + "source/s_le128.c", + "source/s_lt128.c", "source/s_compare96M.c", "source/s_compare128M.c", "source/s_shortShiftLeft64To96M.c", From 4832fbe06cf1c7201090f7b5fe28b3d763b5f4a5 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Wed, 24 Apr 2024 11:46:51 -0500 Subject: [PATCH 28/97] Swap out SoftFloat fns. --- bazel/third_party/softfloat/softfloat.BUILD | 22 ----------------- pkg/noun/jets/i/lagoon.c | 26 +++++++++++++-------- 2 files changed, 16 insertions(+), 32 deletions(-) diff --git a/bazel/third_party/softfloat/softfloat.BUILD b/bazel/third_party/softfloat/softfloat.BUILD index cf01a9dcf2..0d57ba5d7f 100644 --- a/bazel/third_party/softfloat/softfloat.BUILD +++ b/bazel/third_party/softfloat/softfloat.BUILD @@ -20,9 +20,6 @@ cc_library( srcs = [ # See `OBJS_PRIMITIVES` in `build/Linux-ARM-VFPv2-GCC/Makefile` in the # `softfloat` repo. 
- "source/s_eq128.c", - "source/s_le128.c", - "source/s_lt128.c", "source/s_compare96M.c", "source/s_compare128M.c", "source/s_shortShiftLeft64To96M.c", @@ -238,25 +235,6 @@ cc_library( "source/extF80M_eq_signaling.c", "source/extF80M_le_quiet.c", "source/extF80M_lt_quiet.c", - "source/f128_to_f16.c", - "source/f128_to_f32.c", - "source/f128_to_extF80.c", - "source/f128_to_f64.c", - "source/f128_roundToInt.c", - "source/f128_add.c", - "source/f128_sub.c", - "source/f128_mul.c", - "source/f128_mulAdd.c", - "source/f128_div.c", - "source/f128_rem.c", - "source/f128_sqrt.c", - "source/f128_eq.c", - "source/f128_le.c", - "source/f128_lt.c", - "source/f128_eq_signaling.c", - "source/f128_le_quiet.c", - "source/f128_lt_quiet.c", - "source/f128_isSignalingNaN.c", "source/f128M_to_ui32.c", "source/f128M_to_ui64.c", "source/f128M_to_i32.c", diff --git a/pkg/noun/jets/i/lagoon.c b/pkg/noun/jets/i/lagoon.c index be9b8f6791..f0906cd20a 100644 --- a/pkg/noun/jets/i/lagoon.c +++ b/pkg/noun/jets/i/lagoon.c @@ -385,7 +385,7 @@ // Perform division x/n float16_t div_result16 = f16_div(x_val16, y_val16); // Compute floor of the division result - int64_t floor_result16 = f16_to_i64(div_result16, softfloat_round_minMag, false); + c3_ds floor_result16 = f16_to_i64(div_result16, softfloat_round_minMag, false); float16_t floor_float16 = i64_to_f16(floor_result16); // Multiply n by floor(x/n) float16_t mult_result16 = f16_mul(y_val16, floor_float16); @@ -401,7 +401,7 @@ // Perform division x/n float32_t div_result32 = f32_div(x_val32, y_val32); // Compute floor of the division result - int64_t floor_result32 = f32_to_i64(div_result32, softfloat_round_minMag, false); + c3_ds floor_result32 = f32_to_i64(div_result32, softfloat_round_minMag, false); float32_t floor_float32 = i64_to_f32(floor_result32); // Multiply n by floor(x/n) float32_t mult_result32 = f32_mul(y_val32, floor_float32); @@ -417,7 +417,7 @@ // Perform division x/n float64_t div_result64 = f64_div(x_val64, y_val64); // Compute 
floor of the division result - int64_t floor_result64 = f64_to_i64(div_result64, softfloat_round_minMag, false); + c3_ds floor_result64 = f64_to_i64(div_result64, softfloat_round_minMag, false); float64_t floor_float64 = i64_to_f64(floor_result64); // Multiply n by floor(x/n) float64_t mult_result64 = f64_mul(y_val64, floor_float64); @@ -434,7 +434,7 @@ float128_t div_result128; f128M_div((float128_t*)&x_val128, (float128_t*)&y_val128, (float128_t*)&div_result128); // Compute floor of the division result - int64_t floor_result128 = f128_to_i64(div_result128, softfloat_round_minMag, false); + c3_ds floor_result128 = f128M_to_i64(&div_result128, softfloat_round_minMag, false); float128_t floor_float128 = i64_to_f128(floor_result128); // Multiply n by floor(x/n) float128_t mult_result128; @@ -1586,7 +1586,7 @@ // Perform division x/n float16_t div_result16 = f16_mul(in16, x_val16); // Compute floor of the division result - int64_t floor_result16 = f16_to_i64(div_result16, softfloat_round_minMag, false); + c3_ds floor_result16 = f16_to_i64(div_result16, softfloat_round_minMag, false); float16_t floor_float16 = i64_to_f16(floor_result16); // Multiply n by floor(x/n) float16_t mult_result16 = f16_mul(n16, floor_float16); @@ -1604,7 +1604,7 @@ // Perform division x/n float32_t div_result32 = f32_mul((float32_t)in32, (float32_t)x_val32); // Compute floor of the division result - int64_t floor_result32 = f32_to_i64(div_result32, softfloat_round_minMag, false); + c3_ds floor_result32 = f32_to_i64(div_result32, softfloat_round_minMag, false); float32_t floor_float32 = i64_to_f32(floor_result32); // Multiply n by floor(x/n) float32_t mult_result32 = f32_mul(n32, floor_float32); @@ -1622,7 +1622,7 @@ // Perform division x/n float64_t div_result64 = f64_mul(in64, x_val64); // Compute floor of the division result - int64_t floor_result64 = f64_to_i64(div_result64, softfloat_round_minMag, false); + c3_ds floor_result64 = f64_to_i64(div_result64, softfloat_round_minMag, false); 
float64_t floor_float64 = i64_to_f64(floor_result64); // Multiply n by floor(x/n) float64_t mult_result64 = f64_mul(n64, floor_float64); @@ -1641,7 +1641,7 @@ float128_t div_result128; f128M_mul((float128_t*)&in128, (float128_t*)&x_val128, (float128_t*)&div_result128); // Compute floor of the division result - int64_t floor_result128 = f128_to_i64(div_result128, softfloat_round_minMag, false); + c3_ds floor_result128 = f128M_to_i64(&div_result128, softfloat_round_minMag, false); float128_t floor_float128 = i64_to_f128(floor_result128); // Multiply n by floor(x/n) float128_t mult_result128; @@ -1992,7 +1992,10 @@ u3r_bytes(0, 16, (c3_y*)&(a128.v[0]), a); u3r_bytes(0, 16, (c3_y*)&(b128.v[0]), b); u3r_bytes(0, 16, (c3_y*)&(interval128.v), d); - c3_d n128 = f128_to_i64(f128_div(f128_sub(b128, a128), interval128), softfloat_round_minMag, false); + float128_t tmp; + f128M_sub(&b128, &a128, &tmp); + f128M_div(&tmp, &interval128, &interval128); + c3_d n128 = f128M_to_i64(&tmp, softfloat_round_minMag, false); c3_y* x_bytes128 = (c3_y*)u3a_malloc(((n128+1)*16+1)*sizeof(c3_y)); float128_t i128; for (c3_d i = 1; i < n128; i++) { @@ -3220,7 +3223,10 @@ u3r_bytes(0, 16, (c3_y*)&a_, a); u3r_bytes(0, 16, (c3_y*)&b_, b); u3r_bytes(0, 16, (c3_y*)&d_, d); - n_ = f128_to_i64(f128_div(f128_sub((float128_t){b_}, (float128_t){a_}), (float128_t){d_}), softfloat_round_minMag, false); + float128_t tmp; + f128M_sub((float128_t*){&b_}, (float128_t*){&a_}, &tmp); + f128M_div(&tmp, (float128_t*){&d_}, &tmp); + n_ = f128M_to_i64(&tmp, softfloat_round_minMag, false); break; } u3_noun n = u3i_chub(n_+1); From 04a33f757043b75ffd46236b646a7782c044eec9 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Wed, 24 Apr 2024 11:51:00 -0500 Subject: [PATCH 29/97] Swap out SoftFloat fns. 
--- pkg/noun/jets/i/lagoon.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/noun/jets/i/lagoon.c b/pkg/noun/jets/i/lagoon.c index f0906cd20a..b5fe75f298 100644 --- a/pkg/noun/jets/i/lagoon.c +++ b/pkg/noun/jets/i/lagoon.c @@ -435,7 +435,8 @@ f128M_div((float128_t*)&x_val128, (float128_t*)&y_val128, (float128_t*)&div_result128); // Compute floor of the division result c3_ds floor_result128 = f128M_to_i64(&div_result128, softfloat_round_minMag, false); - float128_t floor_float128 = i64_to_f128(floor_result128); + float128_t floor_float128; + i64_to_f128M(floor_result128, &floor_float128); // Multiply n by floor(x/n) float128_t mult_result128; f128M_mul(((float128_t*)&y_val128), ((float128_t*)&floor_float128), ((float128_t*)&mult_result128)); @@ -1642,7 +1643,8 @@ f128M_mul((float128_t*)&in128, (float128_t*)&x_val128, (float128_t*)&div_result128); // Compute floor of the division result c3_ds floor_result128 = f128M_to_i64(&div_result128, softfloat_round_minMag, false); - float128_t floor_float128 = i64_to_f128(floor_result128); + float128_t floor_float128; + i64_to_f128M(floor_result128, &floor_float128); // Multiply n by floor(x/n) float128_t mult_result128; f128M_mul(((float128_t*)&n128), ((float128_t*)&floor_float128), ((float128_t*)&mult_result128)); From 8b76ac5ab77292080312b1516afb74e3117075b2 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Wed, 24 Apr 2024 11:55:46 -0500 Subject: [PATCH 30/97] Swap out SoftFloat fns. 
--- pkg/noun/jets/i/lagoon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/noun/jets/i/lagoon.c b/pkg/noun/jets/i/lagoon.c index b5fe75f298..502cc6af2e 100644 --- a/pkg/noun/jets/i/lagoon.c +++ b/pkg/noun/jets/i/lagoon.c @@ -590,7 +590,7 @@ case 7: ; float128_t min_val128 = ((float128_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { - if(f128_lt(((float128_t*)x_bytes)[i], min_val128)) { + if(f128M_lt(&(((float128_t*)x_bytes)[i]), &min_val128)) { min_val128 = *f128M_min(&min_val128, &((float128_t*)x_bytes)[i]); min_idx = (len_x - i - 1); } From fdd6951712135afbcafab3afa94cd2e654f183ed Mon Sep 17 00:00:00 2001 From: Sigilante Date: Wed, 24 Apr 2024 11:59:29 -0500 Subject: [PATCH 31/97] Swap out SoftFloat fns. --- pkg/noun/jets/i/lagoon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/noun/jets/i/lagoon.c b/pkg/noun/jets/i/lagoon.c index 502cc6af2e..2bfe195b3c 100644 --- a/pkg/noun/jets/i/lagoon.c +++ b/pkg/noun/jets/i/lagoon.c @@ -663,7 +663,7 @@ case 7: ; float128_t max_val128 = ((float128_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { - if(f128_gt(((float128_t*)x_bytes)[i], max_val128)) { + if(f128M_gt(&(((float128_t*)x_bytes)[i]), &max_val128)) { max_val128 = *f128M_max(&max_val128, &((float128_t*)x_bytes)[i]); max_idx = (len_x - i - 1); } From d078a314d24464fd3d51e604f8a96609a2576cf5 Mon Sep 17 00:00:00 2001 From: Pyry Kovanen Date: Thu, 25 Apr 2024 19:37:33 +0300 Subject: [PATCH 32/97] ci: install pkg-config in linux-aarch64 --- .github/workflows/shared.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/shared.yml b/.github/workflows/shared.yml index aee47884a5..bd8044e580 100644 --- a/.github/workflows/shared.yml +++ b/.github/workflows/shared.yml @@ -113,7 +113,7 @@ jobs: run: | case "${{ matrix.target }}" in "linux-aarch64") - sudo apt-get -y install autoconf-archive + sudo apt-get -y install autoconf-archive pkg-config bazel run //bazel/toolchain:aarch64-linux-musl-gcc 
;; "linux-x86_64") From 8b3122b0ab9ea09aa91d00b1163d2bb046e69905 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Fri, 24 May 2024 15:11:13 -0500 Subject: [PATCH 33/97] Update for SoftBLAS version. --- WORKSPACE.bazel | 2 +- pkg/noun/jets/i/lagoon.c | 224 +++++++++++++++++++++------------------ pkg/noun/jets/tree.c | 2 +- 3 files changed, 124 insertions(+), 104 deletions(-) diff --git a/WORKSPACE.bazel b/WORKSPACE.bazel index c904a63eff..ec6a637114 100644 --- a/WORKSPACE.bazel +++ b/WORKSPACE.bazel @@ -297,7 +297,7 @@ versioned_http_archive( strip_prefix = "SoftBLAS-{version}", # sha256 = "", url = "https://github.com/urbit/SoftBLAS/archive/{version}.tar.gz", - version = "7d05697aea5363dcf5f877a9c8b464e9c352d3d4", + version = "29daa2f2fd0ad5070e405ad287f3623804f8fc67", ) versioned_http_archive( diff --git a/pkg/noun/jets/i/lagoon.c b/pkg/noun/jets/i/lagoon.c index 2bfe195b3c..460058ecc7 100644 --- a/pkg/noun/jets/i/lagoon.c +++ b/pkg/noun/jets/i/lagoon.c @@ -98,6 +98,16 @@ return dims; } +/* soft check on u3_none return from q jet +*/ + static inline u3_noun _soft_run(u3_noun a) + { + if (u3_none == a) { + u3m_bail(c3__fail); + } + return a; + } + /* add - axpy = 1*x+y */ u3_noun @@ -125,7 +135,8 @@ // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); - u3r_bytes(0, syz_x+1, y_bytes, y_data); + u3r_bytes(0, syz_x, y_bytes, y_data); + y_bytes[syz_x] = 0x1; // Switch on the block size. switch (u3x_atom(bloq)) { @@ -183,8 +194,9 @@ // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); - u3r_bytes(0, syz_x+1, y_bytes, y_data); - + u3r_bytes(0, syz_x, y_bytes, y_data); + y_bytes[syz_x] = 0x1; + // Switch on the block size. 
switch (u3x_atom(bloq)) { case 4: @@ -242,7 +254,8 @@ // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); - u3r_bytes(0, syz_x+1, y_bytes, y_data); + u3r_bytes(0, syz_x, y_bytes, y_data); + y_bytes[syz_x] = 0x1; // Switch on the block size. switch (u3x_atom(bloq)) { @@ -308,7 +321,8 @@ // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); - u3r_bytes(0, syz_x+1, y_bytes, y_data); + u3r_bytes(0, syz_x, y_bytes, y_data); + y_bytes[syz_x] = 0x1; // Switch on the block size. switch (u3x_atom(bloq)) { @@ -374,7 +388,8 @@ // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); - u3r_bytes(0, syz_x+1, y_bytes, y_data); + u3r_bytes(0, syz_x, y_bytes, y_data); + y_bytes[syz_x] = 0x1; // Switch on the block size. switch (u3x_atom(bloq)) { @@ -557,44 +572,44 @@ // Switch on the block size. switch (u3x_atom(bloq)) { - case 4: ; + case 4: { float16_t min_val16 = ((float16_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { if(f16_lt(((float16_t*)x_bytes)[i], min_val16)) { min_val16 = ((float16_t*)x_bytes)[i]; min_idx = (len_x - i - 1); } - } + } } break; - case 5: ; + case 5: { float32_t min_val32 = ((float32_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { if(f32_lt(((float32_t*)x_bytes)[i], min_val32)) { min_val32 = ((float32_t*)x_bytes)[i]; min_idx = (len_x - i - 1); } - } + } } break; - case 6: ; + case 6: { float64_t min_val64 = ((float64_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { if(f64_lt(((float64_t*)x_bytes)[i], min_val64)) { min_val64 = ((float64_t*)x_bytes)[i]; min_idx = (len_x - i - 1); } - } + } } break; - case 7: ; + case 7: { float128_t min_val128 = ((float128_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { if(f128M_lt(&(((float128_t*)x_bytes)[i]), &min_val128)) { min_val128 = *f128M_min(&min_val128, &((float128_t*)x_bytes)[i]); min_idx = (len_x - 
i - 1); } - } + } } break; } @@ -630,44 +645,44 @@ // Switch on the block size. switch (u3x_atom(bloq)) { - case 4: ; + case 4: { float16_t max_val16 = ((float16_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { if(f16_gt(((float16_t*)x_bytes)[i], max_val16)) { max_val16 = ((float16_t*)x_bytes)[i]; max_idx = (len_x - i - 1); } - } + } } break; - case 5: ; + case 5: { float32_t max_val32 = ((float32_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { if(f32_gt(((float32_t*)x_bytes)[i], max_val32)) { max_val32 = ((float32_t*)x_bytes)[i]; max_idx = (len_x - i - 1); } - } + } } break; - case 6: ; + case 6: { float64_t max_val64 = ((float64_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { if(f64_gt(((float64_t*)x_bytes)[i], max_val64)) { max_val64 = ((float64_t*)x_bytes)[i]; max_idx = (len_x - i - 1); } - } + } } break; - case 7: ; + case 7: { float128_t max_val128 = ((float128_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { if(f128M_gt(&(((float128_t*)x_bytes)[i]), &max_val128)) { max_val128 = *f128M_max(&max_val128, &((float128_t*)x_bytes)[i]); max_idx = (len_x - i - 1); } - } + } } break; } @@ -1744,12 +1759,12 @@ } // Assert length of dims is 2. if (u3qb_lent(shape) != 2) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } // Unpack shape into an array of dimensions. c3_d *dims = _get_dims(shape); if (dims[0] != dims[1]) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } // Unpack the data as a byte array. We assume total length < 2**64. @@ -1791,7 +1806,7 @@ { // Assert length of dims is 2. if (u3qb_lent(shape) != 2) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } // Unpack shape into an array of dimensions. c3_d *dims = _get_dims(shape); @@ -2044,10 +2059,15 @@ c3_d Nb= u3x_atom(u3h(y_shape)); c3_d P = u3x_atom(u3h(u3t(y_shape))); + // Fence on valid bloq size. 
+ if (bloq < 4 || bloq > 7) { + return u3_none; + } + if ((u3_nul != u3t(u3t(x_shape))) || (u3_nul != u3t(u3t(y_shape))) || (Na != Nb)) { - return u3m_bail(c3__exit); + return u3_none; } c3_d N = Na; @@ -2133,7 +2153,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -2157,12 +2177,12 @@ // fxp does not need to match here so no check ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qi_la_add_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_add_real(x_data, y_data, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2188,7 +2208,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -2212,12 +2232,12 @@ // fxp does not need to match here so no check ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qi_la_sub_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_sub_real(x_data, y_data, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2243,7 +2263,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -2267,12 +2287,12 @@ // fxp does not need to match here so no check ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qi_la_mul_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_mul_real(x_data, y_data, x_shape, x_bloq)); 
return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2298,7 +2318,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -2322,12 +2342,12 @@ // fxp does not need to match here so no check ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qi_la_div_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_div_real(x_data, y_data, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2353,7 +2373,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -2377,12 +2397,12 @@ // fxp does not need to match here so no check ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qi_la_mod_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_mod_real(x_data, y_data, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2404,7 +2424,7 @@ 0) || c3n == u3ud(x_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, rnd; @@ -2417,12 +2437,12 @@ c3n == u3ud(x_kind) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qi_la_cumsum_real(x_data, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_cumsum_real(x_data, x_shape, x_bloq)); return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2444,7 +2464,7 @@ 0) || c3n == u3ud(x_data) ) { - return u3m_bail(c3__exit); + 
u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; x_shape = u3h(x_meta); // 2 @@ -2455,7 +2475,7 @@ c3n == u3ud(x_kind) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: ; @@ -2482,7 +2502,7 @@ 0) || c3n == u3ud(x_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; x_shape = u3h(x_meta); // 2 @@ -2492,11 +2512,11 @@ c3n == u3ud(x_kind) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: ; - u3_noun r_data = u3qi_la_ravel_real(x_data, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_ravel_real(x_data, x_shape, x_bloq)); // (list @) return r_data; @@ -2519,7 +2539,7 @@ 0) || c3n == u3ud(x_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; x_shape = u3h(x_meta); // 2 @@ -2530,7 +2550,7 @@ c3n == u3ud(x_kind) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: ; @@ -2557,7 +2577,7 @@ 0) || c3n == u3ud(x_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; x_shape = u3h(x_meta); // 2 @@ -2568,11 +2588,11 @@ c3n == u3ud(x_kind) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: ; - u3_noun r_data = u3qi_la_min_real(x_data, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_min_real(x_data, x_shape, x_bloq)); return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2594,7 +2614,7 @@ 0) || c3n == u3ud(x_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; x_shape = u3h(x_meta); // 2 @@ -2605,11 +2625,11 @@ c3n == u3ud(x_kind) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: ; - u3_noun r_data = u3qi_la_max_real(x_data, x_shape, x_bloq); + u3_noun 
r_data = _soft_run(u3qi_la_max_real(x_data, x_shape, x_bloq)); return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2631,7 +2651,7 @@ 0) || c3n == u3ud(x_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; x_shape = u3h(x_meta); // 2 @@ -2642,11 +2662,11 @@ c3n == u3ud(x_kind) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: ; - u3_noun r_data = u3qi_la_abs_real(x_data, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_abs_real(x_data, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2672,7 +2692,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -2695,11 +2715,11 @@ c3n == u3r_sing(x_fxp, y_fxp) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: ; - u3_noun r_data = u3qi_la_gth_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_gth_real(x_data, y_data, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2725,7 +2745,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -2748,11 +2768,11 @@ c3n == u3r_sing(x_fxp, y_fxp) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: ; - u3_noun r_data = u3qi_la_gte_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_gte_real(x_data, y_data, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2778,7 +2798,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + 
u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -2801,11 +2821,11 @@ c3n == u3r_sing(x_fxp, y_fxp) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: ; - u3_noun r_data = u3qi_la_lth_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_lth_real(x_data, y_data, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2831,7 +2851,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -2854,11 +2874,11 @@ c3n == u3r_sing(x_fxp, y_fxp) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: ; - u3_noun r_data = u3qi_la_lte_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_lte_real(x_data, y_data, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2882,7 +2902,7 @@ c3n == u3ud(x_data) || c3n == u3ud(n) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, rnd; @@ -2894,7 +2914,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qi_la_adds_real(x_data, n, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_adds_real(x_data, n, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2917,7 +2937,7 @@ c3n == u3ud(x_data) || c3n == u3ud(n) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, rnd; @@ -2929,7 +2949,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qi_la_subs_real(x_data, n, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_subs_real(x_data, n, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), 
u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2952,7 +2972,7 @@ c3n == u3ud(x_data) || c3n == u3ud(n) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, rnd; @@ -2964,7 +2984,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qi_la_muls_real(x_data, n, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_muls_real(x_data, n, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2987,7 +3007,7 @@ c3n == u3ud(x_data) || c3n == u3ud(n) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, rnd; @@ -2999,7 +3019,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qi_la_divs_real(x_data, n, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_divs_real(x_data, n, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -3022,7 +3042,7 @@ c3n == u3ud(x_data) || c3n == u3ud(n) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, rnd; @@ -3034,7 +3054,7 @@ switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qi_la_mods_real(x_data, n, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_mods_real(x_data, n, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -3059,7 +3079,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -3083,12 +3103,12 @@ c3n == u3r_sing(x_fxp, y_fxp) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qi_la_dot_real(x_data, y_data, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_dot_real(x_data, y_data, x_shape, x_bloq)); c3_d 
len_x0 = _get_dims(x_shape)[0]; return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); @@ -3111,7 +3131,7 @@ 0) || c3n == u3ud(x_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; x_shape = u3h(x_meta); // 2 @@ -3122,7 +3142,7 @@ c3n == u3ud(x_kind) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun r_data = u3qi_la_transpose(x_data, x_shape, x_bloq); return u3nc(u3nq(u3nt(u3k(u3h(x_shape)), u3k(u3h(u3t(x_shape))), u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); @@ -3142,7 +3162,7 @@ u3x_sam_7, &n, 0)) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; x_shape = u3h(x_meta); // 2 @@ -3154,12 +3174,12 @@ c3n == u3ud(x_kind) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qi_la_linspace_real(a, b, n, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_linspace_real(a, b, n, x_bloq)); x_shape = u3nt(u3x_atom(n), 0x1, u3_nul); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); @@ -3182,7 +3202,7 @@ u3x_sam_7, &d, 0)) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; x_shape = u3h(x_meta); // 2 @@ -3194,12 +3214,12 @@ c3n == u3ud(x_kind) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qi_la_range_real(a, b, d, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_range_real(a, b, d, x_bloq)); c3_d a_, b_, d_; c3_ds n_; switch (x_bloq) { @@ -3254,7 +3274,7 @@ 0) || c3n == u3ud(x_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; x_shape = u3h(x_meta); // 2 @@ -3265,9 +3285,9 @@ c3n == u3ud(x_kind) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { - u3_noun r_data = 
u3qi_la_diag(x_data, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_diag(x_data, x_shape, x_bloq)); c3_d len_x0 = _get_dims(x_shape)[0]; return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); } @@ -3286,7 +3306,7 @@ 0) || c3n == u3ud(x_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; if ( c3n == u3r_mean(x_meta, @@ -3297,11 +3317,11 @@ 0) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: ; - u3_noun r_data = u3qi_la_trace_real(x_data, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_trace_real(x_data, x_shape, x_bloq)); return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -3327,7 +3347,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -3346,12 +3366,12 @@ // fxp does not need to match so no check ) { - return u3m_bail(c3__exit); + u3m_bail(c3__exit); } else { switch (x_kind) { case c3__real: _set_rounding(rnd); - u3_noun r_data = u3qi_la_mmul_real(x_data, y_data, x_shape, y_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_mmul_real(x_data, y_data, x_shape, y_shape, x_bloq)); // result is already [meta data] return r_data; diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index c9118c9a4b..48ef0db9f6 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2220,7 +2220,7 @@ static u3j_core _139_non_d[] = }; static u3j_core _139_hex_d[] = -{ { "sep", 7, 0, _139_non_d, no_hashes }, +{ { "non", 7, 0, _139_non_d, no_hashes }, { "lore", 63, _140_hex_lore_a, 0, no_hashes }, { "leer", 63, _140_hex_leer_a, 0, no_hashes }, From 3f5c6d613aaea345d0fc08105fc2cef4eac4869c Mon Sep 17 00:00:00 2001 From: Quodss Date: Wed, 8 Nov 2023 16:48:09 +0100 Subject: [PATCH 34/97] modernize mass --- pkg/vere/lord.c | 31 
+++++++++++++++++++++++++++++++ pkg/vere/main.c | 5 ++++- pkg/vere/serf.c | 20 ++++++++++++++++---- pkg/vere/serf.h | 2 +- 4 files changed, 52 insertions(+), 6 deletions(-) diff --git a/pkg/vere/lord.c b/pkg/vere/lord.c index a63824ae9b..da739f838d 100644 --- a/pkg/vere/lord.c +++ b/pkg/vere/lord.c @@ -521,6 +521,32 @@ _lord_plea_play(u3_lord* god_u, u3_noun dat) u3z(dat); } +/* _lord_plea_mass(): inject mass report + */ +static void +_lord_plea_mass(u3_lord* god_u, u3_noun dat) +{ + u3_noun cad = u3nc(c3__quac, dat); + u3_noun wir = u3nc(c3__quac, u3_nul); + u3_ovum* egg_u = u3_ovum_init(0, c3__k, wir, cad); + + u3_pier* pir_u = god_u->cb_u.ptr_v; + u3_auto* car_u = c3_calloc(sizeof(*car_u)); + u3_noun ovo; + + car_u->pir_u = pir_u; + car_u->nam_m = c3__quac; // is that right? I did it by analogy w/ smth + + u3_auto_plan(car_u, egg_u); + + u3_assert( u3_auto_next(car_u, &ovo) == egg_u ); + + { + struct timeval tim_tv; + gettimeofday(&tim_tv, 0); + u3_lord_work(god_u, egg_u, u3nc(u3_time_in_tv(&tim_tv), ovo)); + } +} /* _lord_work_spin(): update spinner if more work is in progress. 
*/ static void @@ -742,6 +768,11 @@ _lord_on_plea(void* ptr_v, c3_d len_d, c3_y* byt_y) case c3__ripe: { _lord_plea_ripe(god_u, u3k(dat)); } break; + + case c3__quac: { + _lord_plea_mass(god_u, u3k(dat)); + + } break; } u3z(jar); diff --git a/pkg/vere/main.c b/pkg/vere/main.c index 2222f87dd1..012f4afdfa 100644 --- a/pkg/vere/main.c +++ b/pkg/vere/main.c @@ -1041,7 +1041,10 @@ _cw_serf_writ(void* vod_p, c3_d len_d, c3_y* byt_y) // all references must now be counted, and all roots recorded // - u3_serf_post(&u3V); + u3_weak serf_post_out = u3_serf_post(&u3V); + if (serf_post_out != u3_none) { + _cw_serf_send(u3nc(c3__quac, serf_post_out)); + } } } diff --git a/pkg/vere/serf.c b/pkg/vere/serf.c index cec8bece55..2304a926da 100644 --- a/pkg/vere/serf.c +++ b/pkg/vere/serf.c @@ -32,6 +32,7 @@ :: next steps: +$ plea $% [%live ~] [%ripe [pro=%1 hon=@ nok=@] eve=@ mug=@] + [%quac p=(unit *)] [%slog pri=@ tank] [%flog cord] $: %peek @@ -72,13 +73,15 @@ enum { /* _serf_grab(): garbage collect, checking for profiling. RETAIN. */ -static void +static u3_weak _serf_grab(u3_noun sac) { + u3_noun out = u3_none; if ( u3_nul == sac) { if ( u3C.wag_w & (u3o_debug_ram | u3o_check_corrupt) ) { u3m_grab(sac, u3_none); } + return u3_none; } else { c3_w tot_w = 0; @@ -134,6 +137,8 @@ _serf_grab(u3_noun sac) u3z(sac); u3l_log(""); + + return u3i_word(tot_w * 4); } } @@ -174,11 +179,13 @@ u3_serf_grab(void) } fprintf(stderr, "serf: measuring memory:\r\n"); - + fprintf(stderr, "BEFORE sac FORK:\r\n"); if ( u3_nul != sac ) { + printf("enter _serf_grab\r\n"); _serf_grab(sac); } else { + fprintf(stderr, "sac is empty\r\n"); u3a_print_memory(stderr, "total marked", u3m_mark(stderr)); u3a_print_memory(stderr, "free lists", u3a_idle(u3R)); u3a_print_memory(stderr, "sweep", u3a_sweep()); @@ -190,9 +197,10 @@ u3_serf_grab(void) /* u3_serf_post(): update serf state post-writ. 
*/ -void +u3_weak u3_serf_post(u3_serf* sef_u) { + u3_noun out = u3_none; if ( sef_u->fag_w & _serf_fag_hit1 ) { if ( u3C.wag_w & u3o_verbose ) { u3l_log("serf: threshold 1: %u", u3h_wyt(u3R->cax.per_p)); @@ -213,8 +221,11 @@ u3_serf_post(u3_serf* sef_u) // XX this runs on replay too, |mass s/b elsewhere // if ( sef_u->fag_w & _serf_fag_mute ) { - _serf_grab(sef_u->sac); + u3_weak grab_mass = _serf_grab(sef_u->sac); sef_u->sac = u3_nul; + if (grab_mass != u3_none) { + out = u3nc(u3_nul, grab_mass); + } } if ( sef_u->fag_w & _serf_fag_hit0 ) { @@ -232,6 +243,7 @@ u3_serf_post(u3_serf* sef_u) } sef_u->fag_w = _serf_fag_none; + return out; } /* _serf_curb(): check for memory threshold diff --git a/pkg/vere/serf.h b/pkg/vere/serf.h index 7cd2ca47d0..bfd86e5765 100644 --- a/pkg/vere/serf.h +++ b/pkg/vere/serf.h @@ -51,7 +51,7 @@ /* u3_serf_post(): update serf state post-writ. */ - void + u3_weak u3_serf_post(u3_serf* sef_u); /* u3_serf_grab(): garbage collect. From b7c88b9eb8c300178094496f596ff7b25618426f Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Fri, 14 Jun 2024 18:22:46 -0400 Subject: [PATCH 35/97] http: streaming wip --- pkg/vere/io/http.c | 216 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 213 insertions(+), 3 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index 63812a4792..5f229da1ed 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -7,6 +7,7 @@ #include "openssl/err.h" #include "openssl/ssl.h" #include "version.h" +#include typedef struct _u3_h2o_serv { h2o_globalconf_t fig_u; // h2o global config @@ -44,7 +45,8 @@ typedef struct _u3_h2o_serv { typedef struct _u3_preq { struct _u3_hreq* req_u; // originating request (nullable) struct _u3_httd* htd_u; // device backpointer - u3_noun pax; // partial scry path + u3_noun pax; // partial scry path + c3_o las_o; // was scry at now } u3_preq; /* u3_hcon: incoming http connection. 
@@ -638,8 +640,55 @@ _http_seq_new(u3_hcon* hon_u, h2o_req_t* rec_u) return req_u; } +/* _http_foo_cb() +*/ +static void +// _http_cache_respond(u3_hreq* req_u, u3_noun nun); +_http_scry_respond(u3_hreq* req_u, u3_noun nun); + +static void +_http_foo_cb(void* vod_p, u3_noun nun) +{ + u3_preq* peq_u = vod_p; // TODO + u3_httd* htd_u = peq_u->htd_u; + u3_hreq* req_u = peq_u->req_u; + + if ( req_u ) { + u3_assert(u3_rsat_peek == req_u->sat_e); + req_u->peq_u = 0; + _http_scry_respond(req_u, u3k(nun)); + // _http_cache_respond(req_u, u3k(nun)); + } + + u3h_put(htd_u->nax_p, peq_u->pax, nun); + u3z(peq_u->pax); + c3_free(peq_u); +} + +static c3_c* +_find_tis_fas(void* txt, size_t len) +{ + c3_c* tis = memchr(txt, '=', len); + c3_c* fas = memchr(txt, '/', len); + + if ( tis && fas ) { + return c3_min(tis, fas); + } + else if ( tis ) { + return tis; + } + else { + return fas; + } +} /* _http_req_dispatch(): dispatch http request to %eyre */ +// TODO +// [x] don't blow up on bad paths +// [x] authentication +// [ ] caching +// [ ] range header +// static void _http_req_dispatch(u3_hreq* req_u, u3_noun req) { @@ -652,10 +701,13 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) u3_noun wir = _http_req_to_duct(req_u); u3_noun cad; + c3_c* base = req_u->rec_u->input.path.base; + c3_w len = req_u->rec_u->input.path.len; + { u3_noun adr = u3nc(c3__ipv4, u3i_words(1, &req_u->hon_u->ipf_w)); // XX loopback automatically secure too? 
- // + // u3_noun dat = u3nt(htp_u->sec, adr, req); cad = ( c3y == req_u->hon_u->htp_u->lop ) @@ -663,8 +715,166 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) : u3nc(u3i_string("request"), dat); } - u3_auto_plan(&htd_u->car_u, u3_ovum_init(0, c3__e, wir, cad)); + if ( (len >= 5) && (base[1] == '_') && (base[2] == '~') && (base[3] == '_') && (base[4] == '/')) { + base = base + 4; // retain '/' after /_~_ + len = len - 4; + + req_u->peq_u = c3_malloc(sizeof(*req_u->peq_u)); + req_u->peq_u->req_u = req_u; + req_u->peq_u->htd_u = htd_u; + req_u->sat_e = u3_rsat_peek; + + u3_hfig* fig_u = &req_u->hon_u->htp_u->htd_u->fig_u; + h2o_req_t* rec_u = req_u->rec_u; + + u3_noun our = u3dc("scot", 'p', u3i_chubs(2, htd_u->car_u.pir_u->who_d)); + + // set gang to [~ ~] or ~ + u3_noun gang; + c3_o auth = _http_req_is_auth(fig_u, rec_u); + if ( auth == c3y ) { + gang = u3nc(u3_nul, u3_nul); + } + else { + gang = u3_nul; + } + + u3_noun who; + u3_noun des; + u3_noun cas; + c3_o last = c3n; + + size_t i; + + // get beak from path + // + for (i = 0; i < 3; ++i) { + u3_noun* where; + if ( i == 0 ) { + where = &who; + } + else if ( i == 1 ) { + where = &des; + } + else { + where = &cas; + } + + // find '//' + if ( len >= 2 && base[0] == '/' && base[1] == '/' ) { + *where = u3_nul; + base++; + len--; + } + // skip '/' + else if ( len > 0 && base[0] == '/' ) { + base++; + len--; + } + // '=' + if ( len > 0 && base[0] == '=' ) { + if ( i == 0 ) { + *where = our; + } + else if ( i == 1 ) { + *where = u3i_string("base"); + } + else { + last = c3y; + } + base++; + len--; + } + // slice cord + else { + c3_c* nex = _find_tis_fas(base, len); + if ( !nex ) { + c3_c* msg_c = "bad beam"; + h2o_send_error_generic(req_u->rec_u, 400, msg_c, msg_c, 0); + return; + } + else { + c3_d mylen = nex - base; + *where = u3i_bytes(mylen, base); + base = nex; + len = len - mylen; + } + } + } + + u3_noun spur = u3dc("rush", u3i_bytes(len, (const c3_y*)base), u3v_wish("stap")); + + if ( spur == u3_nul ) { + 
h2o_send_error_generic(req_u->rec_u, 400, "bad spur", "bad spur", 0); + } + else { + if ( who != our ) { + c3_c* msg_c = "who != our"; + h2o_send_error_generic(req_u->rec_u, 400, msg_c, msg_c, 0); + return; + } + if ( c3y == last ) { + // DON'T CACHE + u3_pier_peek_last(htd_u->car_u.pir_u, gang, c3__ex, + des, u3t(spur), req_u->peq_u, _http_foo_cb); + } + + else { + u3_noun bem = u3nq(our, des, cas, u3t(spur)); + // TODO try to serve from cache + u3_pier_peek(htd_u->car_u.pir_u, gang, u3k(u3nt(0, c3__ex, bem)), + req_u->peq_u, _http_foo_cb); + } + } + } + + else { + // inject to arvo + u3_auto_plan(&htd_u->car_u, u3_ovum_init(0, c3__e, wir, cad)); + } + } +} + +static void +_http_scry_respond(u3_hreq* req_u, u3_noun nun) { + h2o_req_t* rec_u = req_u->rec_u; + u3_httd* htd_u = req_u->hon_u->htp_u->htd_u; + + if ( u3_nul == nun ) { + u3_weak req = _http_rec_to_httq(rec_u); + if ( u3_none == req ) { + if ( (u3C.wag_w & u3o_verbose) ) { + u3l_log("strange %.*s request", (c3_i)rec_u->method.len, + rec_u->method.base); + } + c3_c* msg_c = "bad request"; + h2o_send_error_generic(rec_u, 400, msg_c, msg_c, 0); + } + else { + h2o_send_error_500(rec_u, "Internal Server Error", "scry failed", 0); + } } + else if ( u3_none == u3r_at(15, nun) ) { + h2o_send_error_500(rec_u, "Internal Server Error", "scry failed", 0); + } + else { + u3_noun auth, response_header, data; + u3x_qual(u3k(u3t(u3t(nun))), &auth, 0, &response_header, &data); + u3_noun status, headers; + u3x_cell(response_header, &status, &headers); + + // check auth + if ( (c3y == auth) + && (c3n == _http_req_is_auth(&htd_u->fig_u, rec_u)) ) + { + h2o_send_error_403(rec_u, "Unauthorized", "unauthorized", 0); + } + else { + req_u->sat_e = u3_rsat_plan; + _http_start_respond(req_u, u3k(status), u3k(headers), u3k(data), c3y); + } + } + u3z(nun); } /* _http_cache_respond(): respond with a simple-payload:http From d9d9bc872e84ca54afdd6a3e58b1515e3f74d7a1 Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Mon, 24 Jun 2024 
12:34:35 -0400 Subject: [PATCH 36/97] http: serve from cache --- pkg/vere/io/http.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index 5f229da1ed..69fd954328 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -640,16 +640,18 @@ _http_seq_new(u3_hcon* hon_u, h2o_req_t* rec_u) return req_u; } -/* _http_foo_cb() -*/ static void -// _http_cache_respond(u3_hreq* req_u, u3_noun nun); +_http_cache_respond(u3_hreq* req_u, u3_noun nun); + +static void _http_scry_respond(u3_hreq* req_u, u3_noun nun); +/* _http_foo_cb() +*/ static void _http_foo_cb(void* vod_p, u3_noun nun) { - u3_preq* peq_u = vod_p; // TODO + u3_preq* peq_u = vod_p; u3_httd* htd_u = peq_u->htd_u; u3_hreq* req_u = peq_u->req_u; @@ -657,10 +659,11 @@ _http_foo_cb(void* vod_p, u3_noun nun) u3_assert(u3_rsat_peek == req_u->sat_e); req_u->peq_u = 0; _http_scry_respond(req_u, u3k(nun)); - // _http_cache_respond(req_u, u3k(nun)); } - u3h_put(htd_u->nax_p, peq_u->pax, nun); + if ( peq_u->las_o == c3n ) { + u3h_put(htd_u->nax_p, peq_u->pax, nun); + } u3z(peq_u->pax); c3_free(peq_u); } @@ -686,7 +689,8 @@ _find_tis_fas(void* txt, size_t len) // TODO // [x] don't blow up on bad paths // [x] authentication -// [ ] caching +// [x] caching +// [ ] insert mime in path // [ ] range header // static void @@ -821,9 +825,17 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) else { u3_noun bem = u3nq(our, des, cas, u3t(spur)); - // TODO try to serve from cache - u3_pier_peek(htd_u->car_u.pir_u, gang, u3k(u3nt(0, c3__ex, bem)), - req_u->peq_u, _http_foo_cb); + u3_weak nac = u3h_get(htd_u->nax_p, bem); + + if ( u3_none == nac ) { + req_u->peq_u->las_o = c3n; + req_u->peq_u->pax = u3k(bem); + u3_pier_peek(htd_u->car_u.pir_u, gang, u3k(u3nt(0, c3__ex, bem)), + req_u->peq_u, _http_foo_cb); + } + else { + _http_cache_respond(req_u, nac); + } } } } From fa97c3a754e17a63e8c24f1e605bb6969680504f Mon Sep 17 00:00:00 2001 From: 
midden-fabler Date: Fri, 28 Jun 2024 19:47:22 -0400 Subject: [PATCH 37/97] http: cache, range WIP --- pkg/vere/io/http.c | 99 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 83 insertions(+), 16 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index 69fd954328..dc3dbdc870 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -655,6 +655,7 @@ _http_foo_cb(void* vod_p, u3_noun nun) u3_httd* htd_u = peq_u->htd_u; u3_hreq* req_u = peq_u->req_u; + // TODO range if ( req_u ) { u3_assert(u3_rsat_peek == req_u->sat_e); req_u->peq_u = 0; @@ -669,7 +670,7 @@ _http_foo_cb(void* vod_p, u3_noun nun) } static c3_c* -_find_tis_fas(void* txt, size_t len) +_find_tis_fas(void* txt, c3_w len) { c3_c* tis = memchr(txt, '=', len); c3_c* fas = memchr(txt, '/', len); @@ -684,15 +685,47 @@ _find_tis_fas(void* txt, size_t len) return fas; } } -/* _http_req_dispatch(): dispatch http request to %eyre -*/ + // TODO // [x] don't blow up on bad paths // [x] authentication // [x] caching -// [ ] insert mime in path -// [ ] range header +// [x] insert mime in path +// [x] range header +// [x] u3qc_cut +// [ ] _http_range_respond? +// [ ] 216 +// [ ] better range header parsing +// [ ] better slicing +// [ ] multipart ranges // +typedef struct _range_header { + c3_o ok; + c3_w begin; + c3_w end; +} range_header; + +static range_header +_get_range(void* txt, c3_w len) +{ + c3_c* hep = memchr(txt, '-', len); + range_header slice; + + if ( hep ) { + // XX sscanf safety - u3i_bytes first? 
+ sscanf(txt, "%" SCNu32, &slice.begin); + sscanf(hep+1, "%" SCNu32, &slice.end); + slice.ok = c3y; + return slice; + } + else { + slice.ok = c3n; + return slice; + } +} + +/* _http_req_dispatch(): dispatch http request to %eyre +*/ static void _http_req_dispatch(u3_hreq* req_u, u3_noun req) { @@ -748,7 +781,7 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) u3_noun cas; c3_o last = c3n; - size_t i; + c3_w i; // get beak from path // @@ -808,23 +841,23 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) u3_noun spur = u3dc("rush", u3i_bytes(len, (const c3_y*)base), u3v_wish("stap")); - if ( spur == u3_nul ) { - h2o_send_error_generic(req_u->rec_u, 400, "bad spur", "bad spur", 0); + if ( (who != our) || (spur == u3_nul) ) { + c3_c* msg_c = "bad scry path"; + h2o_send_error_generic(req_u->rec_u, 400, msg_c, msg_c, 0); + return; } + else { - if ( who != our ) { - c3_c* msg_c = "who != our"; - h2o_send_error_generic(req_u->rec_u, 400, msg_c, msg_c, 0); - return; - } + spur = u3nc(u3i_string("mime"), u3t(spur)); if ( c3y == last ) { // DON'T CACHE + req_u->peq_u->las_o = c3y; u3_pier_peek_last(htd_u->car_u.pir_u, gang, c3__ex, - des, u3t(spur), req_u->peq_u, _http_foo_cb); + des, spur, req_u->peq_u, _http_foo_cb); } else { - u3_noun bem = u3nq(our, des, cas, u3t(spur)); + u3_noun bem = u3nq(our, des, cas, spur); u3_weak nac = u3h_get(htd_u->nax_p, bem); if ( u3_none == nac ) { @@ -834,7 +867,41 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) req_u->peq_u, _http_foo_cb); } else { - _http_cache_respond(req_u, nac); + // TODO range + h2o_headers_t req_headers = req_u->rec_u->headers; + c3_w idx = h2o_find_header(&req_headers, H2O_TOKEN_RANGE, -1); + + if (idx != UINT32_MAX) { + if ( (req_headers.entries[idx].value.len > 6) && + (memcmp("bytes=", req_headers.entries[idx].value.base, 6) == 0 )) { + c3_w rest_len = req_headers.entries[idx].value.len - 6; + range_header res = _get_range(req_headers.entries[idx].value.base + 6, rest_len); + u3l_log("struct begin %u, end 
%u", res.begin, res.end); + if ( res.ok == c3n ) { + c3_c* msg_c = "Requested Range Not Satisfiable"; + h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); + } + else { + u3_noun octs = u3r_at(127, nac); + + if ( octs == u3_none) { + c3_c* msg_c = "Requested Range Not Satisfiable"; + h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); + return; + } + + // TODO check range vs size + u3_noun piece = u3qc_cut(3, res.begin, res.end + 1, u3t(octs)); + u3_noun result = u3nc(res.end + 1 - res.begin, piece); + u3m_p("result", result); + u3_noun res = u3i_edit(nac, 127, result); + _http_cache_respond(req_u, res); + } + } + } + else { + _http_cache_respond(req_u, nac); + } } } } From 466c7699d2bdc239b7a3de58b60685b83fa1d8bf Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Mon, 1 Jul 2024 12:46:22 -0400 Subject: [PATCH 38/97] http: better header parsing --- pkg/vere/io/http.c | 77 +++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index dc3dbdc870..c1008048a9 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -7,7 +7,9 @@ #include "openssl/err.h" #include "openssl/ssl.h" #include "version.h" +#include #include +#include typedef struct _u3_h2o_serv { h2o_globalconf_t fig_u; // h2o global config @@ -693,35 +695,42 @@ _find_tis_fas(void* txt, c3_w len) // [x] insert mime in path // [x] range header // [x] u3qc_cut -// [ ] _http_range_respond? -// [ ] 216 -// [ ] better range header parsing +// [x] better range header parsing // [ ] better slicing +// [ ] 216 +// [ ] _http_range_respond? 
// [ ] multipart ranges // typedef struct _range_header { - c3_o ok; - c3_w begin; - c3_w end; + c3_z start_z; + c3_z end_z; } range_header; static range_header -_get_range(void* txt, c3_w len) +_get_range(c3_c* txt_c, c3_w len_w) { - c3_c* hep = memchr(txt, '-', len); + c3_c* hep_c = memchr(txt_c, '-', len_w); + c3_c* txt_two_c = txt_c; range_header slice; - - if ( hep ) { - // XX sscanf safety - u3i_bytes first? - sscanf(txt, "%" SCNu32, &slice.begin); - sscanf(hep+1, "%" SCNu32, &slice.end); - slice.ok = c3y; - return slice; - } - else { - slice.ok = c3n; - return slice; + slice.start_z = SIZE_MAX; + slice.end_z = SIZE_MAX; + + // - + // - + // -, - + // -, -, - + // - + + if ( hep_c ) { + slice.start_z = h2o_strtosizefwd(&txt_two_c, hep_c - txt_c); + if ( slice.start_z != SIZE_MAX ) { + u3_assert(txt_two_c == hep_c); + } + txt_two_c = hep_c + 1; + slice.end_z = h2o_strtosizefwd(&txt_two_c, len_w - (txt_two_c - txt_c)); } + + return slice; } /* _http_req_dispatch(): dispatch http request to %eyre @@ -875,28 +884,20 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) if ( (req_headers.entries[idx].value.len > 6) && (memcmp("bytes=", req_headers.entries[idx].value.base, 6) == 0 )) { c3_w rest_len = req_headers.entries[idx].value.len - 6; - range_header res = _get_range(req_headers.entries[idx].value.base + 6, rest_len); - u3l_log("struct begin %u, end %u", res.begin, res.end); - if ( res.ok == c3n ) { + range_header rng_hed = _get_range(req_headers.entries[idx].value.base + 6, rest_len); + u3l_log("struct start %lu, end %lu", rng_hed.start_z, rng_hed.end_z); + u3_noun octs = u3r_at(127, nac); + if ( octs == u3_none) { c3_c* msg_c = "Requested Range Not Satisfiable"; h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); + return; } - else { - u3_noun octs = u3r_at(127, nac); - - if ( octs == u3_none) { - c3_c* msg_c = "Requested Range Not Satisfiable"; - h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); - return; - } - - // TODO check range vs size - 
u3_noun piece = u3qc_cut(3, res.begin, res.end + 1, u3t(octs)); - u3_noun result = u3nc(res.end + 1 - res.begin, piece); - u3m_p("result", result); - u3_noun res = u3i_edit(nac, 127, result); - _http_cache_respond(req_u, res); - } + // TODO check range vs size + u3_noun piece = u3qc_cut(3, rng_hed.start_z, rng_hed.end_z + 1, u3t(octs)); + u3_noun result = u3nc(rng_hed.end_z + 1 - rng_hed.start_z, piece); + u3m_p("result", result); + u3_noun res = u3i_edit(nac, 127, result); + _http_cache_respond(req_u, res); } } else { From 05a5db9f88de0764e03f02fe0d4c4d75e41d30eb Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Tue, 2 Jul 2024 12:25:30 -0400 Subject: [PATCH 39/97] http: better mime slicing, header parsing --- pkg/vere/io/http.c | 85 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 22 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index c1008048a9..7b5d27590d 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -696,8 +696,9 @@ _find_tis_fas(void* txt, c3_w len) // [x] range header // [x] u3qc_cut // [x] better range header parsing -// [ ] better slicing -// [ ] 216 +// [x] better slicing +// [ ] 206 +// [ ] 200 // [ ] _http_range_respond? 
// [ ] multipart ranges // @@ -706,28 +707,62 @@ typedef struct _range_header { c3_z end_z; } range_header; +static u3_noun +_slice_mime(range_header rng, u3_noun octs) +{ + c3_w lent_w = u3h(octs); + c3_w oct_w = u3t(octs); + u3_noun out; + + if ( rng.start_z == SIZE_MAX ) { + if ( rng.end_z == SIZE_MAX) { + // [~ ~] + out = u3_nul; + } + else { + // [~ @] + if ( rng.end_z > lent_w ) { + out = u3_nul; + } + else { + // slice last bytes + out = u3nc( rng.end_z, + u3qc_cut(3, lent_w - rng.end_z, rng.end_z, oct_w)); + } + } + } + else if ( rng.end_z == SIZE_MAX ) { + // [@ ~] + if ( rng.start_z > lent_w ) { + out = u3_nul; + } + else { + out = u3nc( lent_w - rng.start_z, + u3qc_cut(3, rng.start_z, lent_w, oct_w)); + } + } + else if (rng.end_z > lent_w) { + out = u3_nul; + } + else { + // [@ @] + out = u3nc( (rng.end_z - rng.start_z) + 1, + u3qc_cut(3, rng.start_z, (rng.end_z - rng.start_z) + 1, oct_w)); + } + return out; +} + static range_header _get_range(c3_c* txt_c, c3_w len_w) { c3_c* hep_c = memchr(txt_c, '-', len_w); - c3_c* txt_two_c = txt_c; range_header slice; slice.start_z = SIZE_MAX; slice.end_z = SIZE_MAX; - // - - // - - // -, - - // -, -, - - // - - if ( hep_c ) { - slice.start_z = h2o_strtosizefwd(&txt_two_c, hep_c - txt_c); - if ( slice.start_z != SIZE_MAX ) { - u3_assert(txt_two_c == hep_c); - } - txt_two_c = hep_c + 1; - slice.end_z = h2o_strtosizefwd(&txt_two_c, len_w - (txt_two_c - txt_c)); + slice.start_z = h2o_strtosize(txt_c, hep_c - txt_c); + slice.end_z = h2o_strtosize(hep_c + 1, len_w - ((hep_c + 1) - txt_c)); } return slice; @@ -876,26 +911,32 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) req_u->peq_u, _http_foo_cb); } else { - // TODO range h2o_headers_t req_headers = req_u->rec_u->headers; c3_w idx = h2o_find_header(&req_headers, H2O_TOKEN_RANGE, -1); if (idx != UINT32_MAX) { - if ( (req_headers.entries[idx].value.len > 6) && + if ( (req_headers.entries[idx].value.len >= 6) && (memcmp("bytes=", req_headers.entries[idx].value.base, 6) 
== 0 )) { c3_w rest_len = req_headers.entries[idx].value.len - 6; + if ( rest_len == 0) { + c3_c* msg_c = "Requested Range Not Satisfiable"; + h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); + return; + } range_header rng_hed = _get_range(req_headers.entries[idx].value.base + 6, rest_len); - u3l_log("struct start %lu, end %lu", rng_hed.start_z, rng_hed.end_z); u3_noun octs = u3r_at(127, nac); if ( octs == u3_none) { c3_c* msg_c = "Requested Range Not Satisfiable"; h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); return; } - // TODO check range vs size - u3_noun piece = u3qc_cut(3, rng_hed.start_z, rng_hed.end_z + 1, u3t(octs)); - u3_noun result = u3nc(rng_hed.end_z + 1 - rng_hed.start_z, piece); - u3m_p("result", result); + u3_noun result = _slice_mime(rng_hed, octs); + + if ( result == u3_nul) { + c3_c* msg_c = "Requested Range Not Satisfiable"; + h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); + return; + } u3_noun res = u3i_edit(nac, 127, result); _http_cache_respond(req_u, res); } From b0e3f3f0b492aa28bb62dd034720f83a6728bb58 Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Wed, 3 Jul 2024 09:50:44 -0400 Subject: [PATCH 40/97] http: return 200, 206, clean up range header edge cases --- pkg/vere/io/http.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index 7b5d27590d..f4dbb4182e 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -697,9 +697,10 @@ _find_tis_fas(void* txt, c3_w len) // [x] u3qc_cut // [x] better range header parsing // [x] better slicing -// [ ] 206 -// [ ] 200 -// [ ] _http_range_respond? 
+// [x] 206 +// [x] 200 +// [x] open range vs error +// [ ] test stream // [ ] multipart ranges // typedef struct _range_header { @@ -763,6 +764,12 @@ _get_range(c3_c* txt_c, c3_w len_w) if ( hep_c ) { slice.start_z = h2o_strtosize(txt_c, hep_c - txt_c); slice.end_z = h2o_strtosize(hep_c + 1, len_w - ((hep_c + 1) - txt_c)); + // strange -> [SIZE_MAX SIZE_MAX] so we return u3_nul in _slice_mime + if ( ((hep_c != txt_c) && slice.start_z == SIZE_MAX ) || + ( len_w - ((hep_c + 1) - txt_c) > 0 ) && slice.end_z == SIZE_MAX ) { + slice.start_z = SIZE_MAX; + slice.end_z = SIZE_MAX; + } } return slice; @@ -937,8 +944,16 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); return; } - u3_noun res = u3i_edit(nac, 127, result); - _http_cache_respond(req_u, res); + if ( u3r_sing(result, octs) == c3y) { + // 200 + _http_cache_respond(req_u, nac); + } + else { + // 206 + u3_noun tmp = u3i_edit(nac, 124, 206); + u3_noun res = u3i_edit(tmp, 127, result); + _http_cache_respond(req_u, res); + } } } else { From 6f561afb5b4ed4410a713f769c7293a512dabdfb Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Mon, 8 Jul 2024 14:38:09 -0400 Subject: [PATCH 41/97] http: add content-range header --- pkg/vere/io/http.c | 82 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 72 insertions(+), 10 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index f4dbb4182e..577a23db79 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -700,8 +700,10 @@ _find_tis_fas(void* txt, c3_w len) // [x] 206 // [x] 200 // [x] open range vs error -// [ ] test stream -// [ ] multipart ranges +// [x] test stream +// [x] content range function +// [ ] video controls +// [ ] multipart ranges? 
// typedef struct _range_header { c3_z start_z; @@ -774,6 +776,64 @@ _get_range(c3_c* txt_c, c3_w len_w) return slice; } +static u3_noun +_content_rng(range_header rng, c3_w lent_w) +{ + u3_noun out; + u3_atom start; + u3_atom end; + + if ( rng.start_z == SIZE_MAX ) { + if ( rng.end_z == SIZE_MAX) { + // [~ ~] + out = u3_nul; + } + else { + // [~ @] + if ( rng.end_z > lent_w ) { + out = u3_nul; + return out; + } + else { + // last bytes + start = lent_w - rng.end_z; + end = rng.end_z; + } + } + } + else if ( rng.end_z == SIZE_MAX ) { + // [@ ~] + if ( rng.start_z > lent_w ) { + out = u3_nul; + return out; + } + else { + start = rng.start_z; + end = lent_w; + } + } + else if (rng.end_z > lent_w) { + out = u3_nul; + return out; + } + else { + // [@ @] + start = rng.start_z; + end = rng.end_z; + } + + u3_noun lin = u3i_list(u3i_string("bytes "), + u3do("crip", u3do("a-co:co", start)), + c3_s1('-'), + u3do("crip", u3do("a-co:co", end)), + c3_s1('/'), + // XX ++? + u3do("crip", u3do("a-co:co", lent_w)), + u3_none); + u3_atom dat = u3qc_rap(3, lin); + out = u3nc(u3i_string("Content-Range"), dat); + return out; +} /* _http_req_dispatch(): dispatch http request to %eyre */ @@ -944,16 +1004,18 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); return; } - if ( u3r_sing(result, octs) == c3y) { - // 200 - _http_cache_respond(req_u, nac); - } - else { + // if ( u3r_sing(result, octs) == c3y) { + // // 200 + // _http_cache_respond(req_u, nac); + // } + // else { // 206 - u3_noun tmp = u3i_edit(nac, 124, 206); - u3_noun res = u3i_edit(tmp, 127, result); + u3_noun con_rng_hed = _content_rng(rng_hed, u3h(octs)); + u3_noun res = u3i_edit(nac, 127, result); + res = u3i_edit(res, 124, 206); + res = u3i_edit(res, 125, u3nc(con_rng_hed, u3r_at(125, res))); _http_cache_respond(req_u, res); - } + // } } } else { From 59adc332a6a88f0f73c86d8442cb8de397ad879d Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Mon, 8 Jul 2024 
23:32:07 -0400 Subject: [PATCH 42/97] http: fix content-range size --- pkg/vere/io/http.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index 577a23db79..3262a899c5 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -702,7 +702,7 @@ _find_tis_fas(void* txt, c3_w len) // [x] open range vs error // [x] test stream // [x] content range function -// [ ] video controls +// [x] video controls // [ ] multipart ranges? // typedef struct _range_header { @@ -827,8 +827,7 @@ _content_rng(range_header rng, c3_w lent_w) c3_s1('-'), u3do("crip", u3do("a-co:co", end)), c3_s1('/'), - // XX ++? - u3do("crip", u3do("a-co:co", lent_w)), + u3do("crip", u3do("a-co:co", ++lent_w )), u3_none); u3_atom dat = u3qc_rap(3, lin); out = u3nc(u3i_string("Content-Range"), dat); From 695d3684023f7f0b0ae9430407f2cbe8db3fc9e5 Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Tue, 9 Jul 2024 18:39:18 -0400 Subject: [PATCH 43/97] http: consolidate slicing & content-range logic --- pkg/vere/io/http.c | 111 ++++++++++++++++++--------------------------- 1 file changed, 44 insertions(+), 67 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index 3262a899c5..f22b86679e 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -7,9 +7,6 @@ #include "openssl/err.h" #include "openssl/ssl.h" #include "version.h" -#include -#include -#include typedef struct _u3_h2o_serv { h2o_globalconf_t fig_u; // h2o global config @@ -703,6 +700,7 @@ _find_tis_fas(void* txt, c3_w len) // [x] test stream // [x] content range function // [x] video controls +// [ ] fix repeated 1 byte requests // [ ] multipart ranges? 
// typedef struct _range_header { @@ -710,47 +708,67 @@ typedef struct _range_header { c3_z end_z; } range_header; -static u3_noun +typedef struct _content { + c3_z start_z; + c3_z end_z; + u3_noun dat; +} content; + +static content _slice_mime(range_header rng, u3_noun octs) { c3_w lent_w = u3h(octs); c3_w oct_w = u3t(octs); - u3_noun out; + content out; if ( rng.start_z == SIZE_MAX ) { - if ( rng.end_z == SIZE_MAX) { + if ( rng.end_z == SIZE_MAX ) { // [~ ~] - out = u3_nul; + out.start_z = SIZE_MAX; + out.end_z = SIZE_MAX; + out.dat = u3_nul; } else { // [~ @] if ( rng.end_z > lent_w ) { - out = u3_nul; + out.start_z = SIZE_MAX; + out.end_z = SIZE_MAX; + out.dat = u3_nul; } else { // slice last bytes - out = u3nc( rng.end_z, - u3qc_cut(3, lent_w - rng.end_z, rng.end_z, oct_w)); + out.start_z = lent_w - rng.end_z; + out.end_z = rng.end_z; + out.dat = u3nc(rng.end_z, + u3qc_cut(3, out.start_z, out.end_z, oct_w)); } } } else if ( rng.end_z == SIZE_MAX ) { // [@ ~] if ( rng.start_z > lent_w ) { - out = u3_nul; + out.start_z = SIZE_MAX; + out.end_z = SIZE_MAX; + out.dat = u3_nul; } else { - out = u3nc( lent_w - rng.start_z, - u3qc_cut(3, rng.start_z, lent_w, oct_w)); + out.start_z = rng.start_z; + out.end_z = lent_w; + out.dat = u3nc(lent_w - rng.start_z, + u3qc_cut(3, rng.start_z, rng.end_z, oct_w)); } } else if (rng.end_z > lent_w) { - out = u3_nul; + out.start_z = SIZE_MAX; + out.end_z = SIZE_MAX; + out.dat = u3_nul; } else { // [@ @] - out = u3nc( (rng.end_z - rng.start_z) + 1, - u3qc_cut(3, rng.start_z, (rng.end_z - rng.start_z) + 1, oct_w)); + out.start_z = rng.start_z; + out.end_z = (rng.end_z - rng.start_z) + 1; + out.dat = u3nc((rng.end_z - rng.start_z) + 1, + u3qc_cut(3, out.start_z, out.end_z, oct_w)); } return out; } @@ -767,65 +785,24 @@ _get_range(c3_c* txt_c, c3_w len_w) slice.start_z = h2o_strtosize(txt_c, hep_c - txt_c); slice.end_z = h2o_strtosize(hep_c + 1, len_w - ((hep_c + 1) - txt_c)); // strange -> [SIZE_MAX SIZE_MAX] so we return u3_nul in 
_slice_mime - if ( ((hep_c != txt_c) && slice.start_z == SIZE_MAX ) || - ( len_w - ((hep_c + 1) - txt_c) > 0 ) && slice.end_z == SIZE_MAX ) { + if ( ((hep_c != txt_c) && (slice.start_z == SIZE_MAX)) || + ((len_w - ((hep_c + 1) - txt_c) > 0) && (slice.end_z == SIZE_MAX)) ) { slice.start_z = SIZE_MAX; slice.end_z = SIZE_MAX; } } - return slice; } + static u3_noun -_content_rng(range_header rng, c3_w lent_w) +_content_rng(c3_z start_z, c3_z end_z, c3_w lent_w) { u3_noun out; - u3_atom start; - u3_atom end; - - if ( rng.start_z == SIZE_MAX ) { - if ( rng.end_z == SIZE_MAX) { - // [~ ~] - out = u3_nul; - } - else { - // [~ @] - if ( rng.end_z > lent_w ) { - out = u3_nul; - return out; - } - else { - // last bytes - start = lent_w - rng.end_z; - end = rng.end_z; - } - } - } - else if ( rng.end_z == SIZE_MAX ) { - // [@ ~] - if ( rng.start_z > lent_w ) { - out = u3_nul; - return out; - } - else { - start = rng.start_z; - end = lent_w; - } - } - else if (rng.end_z > lent_w) { - out = u3_nul; - return out; - } - else { - // [@ @] - start = rng.start_z; - end = rng.end_z; - } u3_noun lin = u3i_list(u3i_string("bytes "), - u3do("crip", u3do("a-co:co", start)), + u3do("crip", u3do("a-co:co", start_z)), c3_s1('-'), - u3do("crip", u3do("a-co:co", end)), + u3do("crip", u3do("a-co:co", end_z)), c3_s1('/'), u3do("crip", u3do("a-co:co", ++lent_w )), u3_none); @@ -996,9 +973,9 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); return; } - u3_noun result = _slice_mime(rng_hed, octs); + content result = _slice_mime(rng_hed, octs); - if ( result == u3_nul) { + if ( result.dat == u3_nul) { c3_c* msg_c = "Requested Range Not Satisfiable"; h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); return; @@ -1009,8 +986,8 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) // } // else { // 206 - u3_noun con_rng_hed = _content_rng(rng_hed, u3h(octs)); - u3_noun res = u3i_edit(nac, 127, result); + u3_noun con_rng_hed = 
_content_rng(result.start_z, result.end_z, u3h(octs)); + u3_noun res = u3i_edit(nac, 127, result.dat); res = u3i_edit(res, 124, 206); res = u3i_edit(res, 125, u3nc(con_rng_hed, u3r_at(125, res))); _http_cache_respond(req_u, res); From 11439ebbb0f64f87ce6801b52ce233c34c67c1dd Mon Sep 17 00:00:00 2001 From: Tinnus Napbus Date: Fri, 31 May 2024 21:30:46 +1200 Subject: [PATCH 44/97] mass: return memory report to arvo --- pkg/noun/allocate.c | 186 +++++++++++++++++++++++++++++++++----------- pkg/noun/allocate.h | 19 +++-- pkg/noun/jets.c | 60 ++++++++++---- pkg/noun/jets.h | 4 +- pkg/noun/manage.c | 19 ++--- pkg/noun/manage.h | 14 +++- pkg/noun/nock.c | 27 +++++-- pkg/noun/nock.h | 4 +- pkg/noun/vortex.c | 30 +++++-- pkg/noun/vortex.h | 4 +- pkg/vere/io/term.c | 24 ++++++ pkg/vere/king.c | 30 +++++-- pkg/vere/lord.c | 61 ++++++--------- pkg/vere/main.c | 8 +- pkg/vere/serf.c | 172 ++++++++++++++++++++++++++++++---------- pkg/vere/serf.h | 10 ++- pkg/vere/vere.h | 13 +++- 17 files changed, 501 insertions(+), 184 deletions(-) diff --git a/pkg/noun/allocate.c b/pkg/noun/allocate.c index 8ca5085397..620d5fac5c 100644 --- a/pkg/noun/allocate.c +++ b/pkg/noun/allocate.c @@ -9,6 +9,8 @@ #include "retrieve.h" #include "trace.h" #include "vortex.h" +#include "noun.h" +#include "defs.h" u3_road* u3a_Road; @@ -2005,9 +2007,8 @@ u3a_maid(FILE* fil_u, c3_c* cap_c, c3_w wor_w) /* _ca_print_memory(): un-captioned u3a_print_memory(). */ static void -_ca_print_memory(FILE* fil_u, c3_w wor_w) +_ca_print_memory(FILE* fil_u, c3_w byt_w) { - c3_w byt_w = (wor_w * 4); c3_w gib_w = (byt_w / 1000000000); c3_w mib_w = (byt_w % 1000000000) / 1000000; c3_w kib_w = (byt_w % 1000000) / 1000; @@ -2028,43 +2029,54 @@ _ca_print_memory(FILE* fil_u, c3_w wor_w) } } +/* u3a_quac_free: free quac memory. 
+*/ +void +u3a_quac_free(quac* qua_u) +{ + for ( c3_w i_w = 0; i_w < qua_u->len_w; i_w++ ) { + u3a_quac_free(qua_u->qua_u[i_w]); + } + c3_free(qua_u->nam_c); + c3_free(qua_u->qua_u); + c3_free(qua_u); +} + /* u3a_prof(): mark/measure/print memory profile. RETAIN. */ -c3_w -u3a_prof(FILE* fil_u, c3_w den_w, u3_noun mas) +quac* +u3a_prof(FILE* fil_u, u3_noun mas) { - c3_w tot_w = 0; +// c3_w tot_w = 0; + quac* pro_u = c3_calloc(sizeof(*pro_u)); u3_noun h_mas, t_mas; if ( c3n == u3r_cell(mas, &h_mas, &t_mas) ) { - fprintf(fil_u, "%.*smistyped mass\r\n", den_w, ""); - return tot_w; + fprintf(fil_u, "mistyped mass\r\n"); + c3_free(pro_u); + return NULL; } - else if ( _(u3du(h_mas)) ) { - fprintf(fil_u, "%.*smistyped mass head\r\n", den_w, ""); + else if ( c3y == u3du(h_mas) ) { + fprintf(fil_u, "mistyped mass head\r\n"); { c3_c* lab_c = u3m_pretty(h_mas); fprintf(fil_u, "h_mas: %s", lab_c); c3_free(lab_c); } - return tot_w; + c3_free(pro_u); + return NULL; } else { - { - c3_c* lab_c = u3m_pretty(h_mas); - fprintf(fil_u, "%*s%s: ", den_w, "", lab_c); - c3_free(lab_c); - } u3_noun it_mas, tt_mas; if ( c3n == u3r_cell(t_mas, &it_mas, &tt_mas) ) { - fprintf(fil_u, "%*smistyped mass tail\r\n", den_w, ""); - return tot_w; + fprintf(fil_u, "mistyped mass tail\r\n"); + c3_free(pro_u); + return NULL; } else if ( c3y == it_mas ) { - tot_w += u3a_mark_noun(tt_mas); - _ca_print_memory(fil_u, tot_w); + c3_w siz_w = u3a_mark_noun(tt_mas); #if 1 /* The basic issue here is that tt_mas is included in .sac @@ -2075,7 +2087,7 @@ u3a_prof(FILE* fil_u, c3_w den_w, u3_noun mas) * * see u3a_mark_ptr(). 
*/ - if ( _(u3a_is_dog(tt_mas)) ) { + if ( c3y == u3a_is_dog(tt_mas) ) { u3a_box* box_u = u3a_botox(u3a_to_ptr(tt_mas)); #ifdef U3_MEMORY_DEBUG if ( 1 == box_u->eus_w ) { @@ -2094,45 +2106,131 @@ u3a_prof(FILE* fil_u, c3_w den_w, u3_noun mas) #endif } #endif + pro_u->nam_c = u3r_string(h_mas); + pro_u->siz_w = siz_w*4; + pro_u->len_w = 0; + pro_u->qua_u = 0; + return pro_u; - return tot_w; } else if ( c3n == it_mas ) { fprintf(fil_u, "\r\n"); - - while ( _(u3du(tt_mas)) ) { - tot_w += u3a_prof(fil_u, den_w+2, u3h(tt_mas)); + pro_u->qua_u = c3_malloc(sizeof(pro_u->qua_u)); + pro_u->len_w = 0; + c3_w i_w = 2; + c3_t bad_t = 0; + while ( c3y == u3du(tt_mas) ) { + quac* new_u = u3a_prof(fil_u, u3h(tt_mas)); + if ( NULL == new_u ) { + c3_free(new_u); + bad_t = 1; + } else { + pro_u->qua_u = c3_realloc(pro_u->qua_u, i_w*sizeof(pro_u->qua_u)); + pro_u->siz_w += new_u->siz_w; + pro_u->qua_u[pro_u->len_w] = new_u; + pro_u->len_w++; + } tt_mas = u3t(tt_mas); + i_w++; + } + + if ( bad_t ) { + for ( i_w = 0; i_w < pro_u->len_w ; i_w++ ) { + u3a_quac_free(pro_u->qua_u[i_w]); + } + c3_free(pro_u->qua_u); + c3_free(pro_u); + return NULL; + } else { + pro_u->nam_c = u3r_string(h_mas); + return pro_u; } + } + else { + fprintf(fil_u, "mistyped (strange) mass tail\r\n"); + c3_free(pro_u); + return NULL; + } + } +} - fprintf(fil_u, "%*s--", den_w, ""); - _ca_print_memory(fil_u, tot_w); - return tot_w; +/* u3a_print_quac: print a quac memory report. 
+*/ - } - else { - fprintf(fil_u, "%*smistyped (strange) mass tail\r\n", den_w, ""); - return tot_w; +void +u3a_print_quac(FILE* fil_u, c3_w den_w, quac* mas_u) +{ + u3_assert( 0 != fil_u ); + + if ( mas_u->siz_w ) { + fprintf(fil_u, "%*s%s: ", den_w, "", mas_u->nam_c); + + if ( mas_u->len_w == 0) { + _ca_print_memory(fil_u, mas_u->siz_w); + } else { + fprintf(fil_u, "\r\n"); + c3_w i_w; + for ( i_w = 0; i_w < mas_u->len_w; i_w++ ) { + u3a_print_quac(fil_u, den_w+2, mas_u->qua_u[i_w]); + } + fprintf(fil_u, "%*s--", den_w, ""); + _ca_print_memory(fil_u, mas_u->siz_w); } } } /* u3a_mark_road(): mark ad-hoc persistent road structures. */ -c3_w -u3a_mark_road(FILE* fil_u) -{ - c3_w tot_w = 0; - tot_w += u3a_maid(fil_u, " namespace", u3a_mark_noun(u3R->ski.gul)); - tot_w += u3a_maid(fil_u, " trace stack", u3a_mark_noun(u3R->bug.tax)); - tot_w += u3a_maid(fil_u, " trace buffer", u3a_mark_noun(u3R->bug.mer)); - tot_w += u3a_maid(fil_u, " profile batteries", u3a_mark_noun(u3R->pro.don)); - tot_w += u3a_maid(fil_u, " profile doss", u3a_mark_noun(u3R->pro.day)); - tot_w += u3a_maid(fil_u, " new profile trace", u3a_mark_noun(u3R->pro.trace)); - tot_w += u3a_maid(fil_u, " transient memoization cache", u3h_mark(u3R->cax.har_p)); - tot_w += u3a_maid(fil_u, " persistent memoization cache", u3h_mark(u3R->cax.per_p)); - return u3a_maid(fil_u, "total road stuff", tot_w); +quac* +u3a_mark_road() +{ + quac** qua_u = c3_malloc(sizeof(*qua_u)*8); + + qua_u[0] = c3_calloc(sizeof(*qua_u[0])); + qua_u[0]->nam_c = strdup("namespace"); + qua_u[0]->siz_w = u3a_mark_noun(u3R->ski.gul)*4; + + qua_u[1] = c3_calloc(sizeof(*qua_u[1])); + qua_u[1]->nam_c = strdup("trace stack"); + qua_u[1]->siz_w = u3a_mark_noun(u3R->ski.gul)*4; + + qua_u[2] = c3_calloc(sizeof(*qua_u[2])); + qua_u[2]->nam_c = strdup("trace buffer"); + qua_u[2]->siz_w = u3a_mark_noun(u3R->bug.mer)*4; + + qua_u[3] = c3_calloc(sizeof(*qua_u[3])); + qua_u[3]->nam_c = strdup("profile batteries"); + qua_u[3]->siz_w = 
u3a_mark_noun(u3R->pro.don)*4; + + qua_u[4] = c3_calloc(sizeof(*qua_u[4])); + qua_u[4]->nam_c = strdup("profile doss"); + qua_u[4]->siz_w = u3a_mark_noun(u3R->pro.day)*4; + + qua_u[5] = c3_calloc(sizeof(*qua_u[5])); + qua_u[5]->nam_c = strdup("new profile trace"); + qua_u[5]->siz_w = u3a_mark_noun(u3R->pro.trace)*4; + + qua_u[6] = c3_calloc(sizeof(*qua_u[6])); + qua_u[6]->nam_c = strdup("transient memoization cache"); + qua_u[6]->siz_w = u3h_mark(u3R->cax.har_p)*4; + + qua_u[7] = c3_calloc(sizeof(*qua_u[7])); + qua_u[7]->nam_c = strdup("persistent memoization cache"); + qua_u[7]->siz_w = u3h_mark(u3R->cax.per_p)*4; + + c3_w sum_w = 0; + for (c3_w i_w = 0; i_w < 8; i_w++) { + sum_w += qua_u[i_w]->siz_w; + } + + quac* tot_u = c3_malloc(sizeof(*tot_u)); + tot_u->nam_c = strdup("total road stuff"); + tot_u->siz_w = sum_w; + tot_u->len_w = 8; + tot_u->qua_u = qua_u; + + return tot_u; } /* u3a_reclaim(): clear ad-hoc persistent caches to reclaim memory. diff --git a/pkg/noun/allocate.h b/pkg/noun/allocate.h index b851b64c98..7763fd3183 100644 --- a/pkg/noun/allocate.h +++ b/pkg/noun/allocate.h @@ -606,8 +606,8 @@ /* u3a_mark_road(): mark ad-hoc persistent road structures. */ - c3_w - u3a_mark_road(FILE* fil_u); + quac* + u3a_mark_road(); /* u3a_reclaim(): clear ad-hoc persistent caches to reclaim memory. */ @@ -702,21 +702,30 @@ void u3a_print_time(c3_c* str_c, c3_c* cap_c, c3_d mic_d); + /* u3a_print_quac: print a quac memory report. + */ + void + u3a_print_quac(FILE* fil_u, c3_w den_w, quac* mas_u); + /* u3a_print_memory(): print memory amount. */ void u3a_print_memory(FILE* fil_u, c3_c* cap_c, c3_w wor_w); - /* u3a_prof(): mark/measure/print memory profile. RETAIN. */ - c3_w - u3a_prof(FILE* fil_u, c3_w den_w, u3_noun mas); + quac* + u3a_prof(FILE* fil_u, u3_noun mas); /* u3a_maid(): maybe print memory. */ c3_w u3a_maid(FILE* fil_u, c3_c* cap_c, c3_w wor_w); + /* u3a_uncap_print_memory(): un-captioned print memory amount. 
+ */ + void + u3a_uncap_print_memory(FILE* fil_u, c3_w byt_w); + /* u3a_deadbeef(): write 0xdeadbeef from hat to cap. */ void diff --git a/pkg/noun/jets.c b/pkg/noun/jets.c index 45e7e8144a..e64ce26c8a 100644 --- a/pkg/noun/jets.c +++ b/pkg/noun/jets.c @@ -2305,27 +2305,59 @@ _cj_mark_hank(u3_noun kev, void* dat) /* u3j_mark(): mark jet state for gc. */ -c3_w -u3j_mark(FILE* fil_u) +quac* +u3j_mark() { - c3_w tot_w = 0; + quac** qua_u = c3_malloc(sizeof(*qua_u)*6); - tot_w += u3a_maid(fil_u, " warm jet state", u3h_mark(u3R->jed.war_p)); - tot_w += u3a_maid(fil_u, " cold jet state", u3h_mark(u3R->jed.cod_p)); - tot_w += u3a_maid(fil_u, " hank cache", u3h_mark(u3R->jed.han_p)); - tot_w += u3a_maid(fil_u, " battery hash cache", u3h_mark(u3R->jed.bas_p)); + qua_u[0] = c3_calloc(sizeof(*qua_u[0])); + qua_u[0]->nam_c = strdup("warm jet state"); + qua_u[0]->siz_w = u3h_mark(u3R->jed.war_p)*4; - { - c3_w han_w = 0; - u3h_walk_with(u3R->jed.han_p, _cj_mark_hank, &han_w); - tot_w += u3a_maid(fil_u, " call site cache", han_w); + qua_u[1] = c3_calloc(sizeof(*qua_u[1])); + qua_u[1]->nam_c = strdup("cold jet state"); + qua_u[1]->siz_w = u3h_mark(u3R->jed.cod_p)*4; + + qua_u[2] = c3_calloc(sizeof(*qua_u[2])); + qua_u[2]->nam_c = strdup("hank cache"); + qua_u[2]->siz_w = u3h_mark(u3R->jed.han_p)*4; + + qua_u[3] = c3_calloc(sizeof(*qua_u[3])); + qua_u[3]->nam_c = strdup("battery hash cache"); + qua_u[3]->siz_w = u3h_mark(u3R->jed.bas_p)*4; + + qua_u[4] = c3_calloc(sizeof(*qua_u[4])); + qua_u[4]->nam_c = strdup("call site cache"); + u3h_walk_with(u3R->jed.han_p, _cj_mark_hank, &qua_u[4]->siz_w); + qua_u[4]->siz_w = qua_u[4]->siz_w*4; + + c3_w sum_w = 0; + for ( c3_w i_w = 0; i_w < 5; i_w++ ) { + sum_w += qua_u[i_w]->siz_w; } + quac* tot_u = c3_calloc(sizeof(*tot_u)); + tot_u->nam_c = strdup("total jet stuff"); + if ( u3R == &(u3H->rod_u) ) { - tot_w += u3a_maid(fil_u, " hot jet state", u3h_mark(u3R->jed.hot_p)); - } + qua_u[5] = c3_calloc(sizeof(*qua_u[5])); + qua_u[5]->nam_c = 
strdup("hot jet state"); + qua_u[5]->siz_w = u3h_mark(u3R->jed.hot_p)*4; + + sum_w += qua_u[5]->siz_w; + + tot_u->siz_w = sum_w; + tot_u->len_w = 6; + tot_u->qua_u = qua_u; - return u3a_maid(fil_u, "total jet stuff", tot_w); + return tot_u; + } else { + tot_u->siz_w = sum_w; + tot_u->len_w = 5; + tot_u->qua_u = qua_u; + + return tot_u; + } } /* u3j_free_hank(): free an entry from the hank cache. diff --git a/pkg/noun/jets.h b/pkg/noun/jets.h index 1440f7558a..02c2c54950 100644 --- a/pkg/noun/jets.h +++ b/pkg/noun/jets.h @@ -296,8 +296,8 @@ /* u3j_mark(): mark jet state for gc. */ - c3_w - u3j_mark(FILE* fil_u); + quac* + u3j_mark(); /* u3j_free(): free jet state. */ diff --git a/pkg/noun/manage.c b/pkg/noun/manage.c index d880a42032..7726ce6c2b 100644 --- a/pkg/noun/manage.c +++ b/pkg/noun/manage.c @@ -459,15 +459,16 @@ u3m_file(c3_c* pas_c) /* u3m_mark(): mark all nouns in the road. */ -c3_w -u3m_mark(FILE* fil_u) -{ - c3_w tot_w = 0; - tot_w += u3v_mark(fil_u); - tot_w += u3j_mark(fil_u); - tot_w += u3n_mark(fil_u); - tot_w += u3a_mark_road(fil_u); - return tot_w; +quac** +u3m_mark() +{ + quac** qua_u = c3_malloc(sizeof(*qua_u)*4); + qua_u[0] = u3v_mark(); + qua_u[1] = u3j_mark(); + qua_u[2] = u3n_mark(); + qua_u[3] = u3a_mark_road(); + + return qua_u; } /* _pave_parts(): build internal tables. diff --git a/pkg/noun/manage.h b/pkg/noun/manage.h index 14f9cc503d..ce1e66a86b 100644 --- a/pkg/noun/manage.h +++ b/pkg/noun/manage.h @@ -148,10 +148,20 @@ u3_noun u3m_soft_esc(u3_noun ref, u3_noun sam); + + /* quac: memory report. + */ + typedef struct _quac { + c3_c* nam_c; + c3_w siz_w; + c3_w len_w; + struct _quac** qua_u; + } quac; + /* u3m_mark(): mark all nouns in the road. */ - c3_w - u3m_mark(FILE* fil_u); + quac** + u3m_mark(); /* u3m_grab(): garbage-collect the world, plus extra roots. 
*/ diff --git a/pkg/noun/nock.c b/pkg/noun/nock.c index 452b04c309..60f713f831 100644 --- a/pkg/noun/nock.c +++ b/pkg/noun/nock.c @@ -3047,16 +3047,29 @@ _n_bam(u3_noun kev, void* dat) /* u3n_mark(): mark the bytecode cache for gc. */ -c3_w -u3n_mark(FILE* fil_u) +quac* +u3n_mark() { - c3_w bam_w = 0, har_w = 0; + quac** qua_u = c3_malloc(sizeof(*qua_u)*2); + + qua_u[0] = c3_calloc(sizeof(*qua_u[0])); + qua_u[0]->nam_c = strdup("bytecode programs"); + u3p(u3h_root) har_p = u3R->byc.har_p; - u3h_walk_with(har_p, _n_bam, &bam_w); + u3h_walk_with(har_p, _n_bam, &qua_u[0]->siz_w); + qua_u[0]->siz_w = qua_u[0]->siz_w*4; + + qua_u[1] = c3_calloc(sizeof(*qua_u[1])); + qua_u[1]->nam_c = strdup("bytecode cache"); + qua_u[1]->siz_w = u3h_mark(har_p)*4; + + quac* tot_u = c3_malloc(sizeof(*tot_u)); + tot_u->nam_c = strdup("total nock stuff"); + tot_u->siz_w = qua_u[0]->siz_w + qua_u[1]->siz_w; + tot_u->len_w = 2; + tot_u->qua_u = qua_u; - bam_w = u3a_maid(fil_u, " bytecode programs", bam_w); - har_w = u3a_maid(fil_u, " bytecode cache", u3h_mark(har_p)); - return u3a_maid(fil_u, "total nock stuff", bam_w + har_w); + return tot_u; } /* u3n_reclaim(): clear ad-hoc persistent caches to reclaim memory. diff --git a/pkg/noun/nock.h b/pkg/noun/nock.h index 7baf7351d8..19489d304c 100644 --- a/pkg/noun/nock.h +++ b/pkg/noun/nock.h @@ -123,8 +123,8 @@ /* u3n_mark(): mark bytecode cache. */ - c3_w - u3n_mark(FILE* fil_u); + quac* + u3n_mark(); /* u3n_reclaim(): clear ad-hoc persistent caches to reclaim memory. */ diff --git a/pkg/noun/vortex.c b/pkg/noun/vortex.c index 0bb9af8c29..8de5b2bd88 100644 --- a/pkg/noun/vortex.c +++ b/pkg/noun/vortex.c @@ -392,16 +392,32 @@ u3v_sway(u3_noun blu, c3_l tab_l, u3_noun tax) /* u3v_mark(): mark arvo kernel. 
*/ -c3_w -u3v_mark(FILE* fil_u) +quac* +u3v_mark() { u3v_arvo* arv_u = &(u3H->arv_u); - c3_w tot_w = 0; - tot_w += u3a_maid(fil_u, " kernel", u3a_mark_noun(arv_u->roc)); - tot_w += u3a_maid(fil_u, " date", u3a_mark_noun(arv_u->now)); - tot_w += u3a_maid(fil_u, " wish cache", u3a_mark_noun(arv_u->yot)); - return u3a_maid(fil_u, "total arvo stuff", tot_w); + quac** qua_u = c3_malloc(sizeof(*qua_u)*3); + + qua_u[0] = c3_calloc(sizeof(*qua_u[0])); + qua_u[0]->nam_c = strdup("kernel"); + qua_u[0]->siz_w = u3a_mark_noun(arv_u->roc)*4; + + qua_u[1] = c3_calloc(sizeof(*qua_u[1])); + qua_u[1]->nam_c = strdup("date"); + qua_u[1]->siz_w = u3a_mark_noun(arv_u->now)*4; + + qua_u[2] = c3_calloc(sizeof(*qua_u[2])); + qua_u[2]->nam_c = strdup("wish cache"); + qua_u[2]->siz_w = u3a_mark_noun(arv_u->yot)*4; + + quac* tot_u = c3_malloc(sizeof(*tot_u)); + tot_u->nam_c = strdup("total arvo stuff"); + tot_u->siz_w = qua_u[0]->siz_w + qua_u[1]->siz_w + qua_u[2]->siz_w; + tot_u->len_w = 3; + tot_u->qua_u = qua_u; + + return tot_u; } /* u3v_reclaim(): clear ad-hoc persistent caches to reclaim memory. diff --git a/pkg/noun/vortex.h b/pkg/noun/vortex.h index 2d202e831c..c254b32625 100644 --- a/pkg/noun/vortex.h +++ b/pkg/noun/vortex.h @@ -130,8 +130,8 @@ /* u3v_mark(): mark arvo kernel. */ - c3_w - u3v_mark(FILE* fil_u); + quac* + u3v_mark(); /* u3v_reclaim(): clear ad-hoc persistent caches to reclaim memory. */ diff --git a/pkg/vere/io/term.c b/pkg/vere/io/term.c index ff3f11efd2..927116bc62 100644 --- a/pkg/vere/io/term.c +++ b/pkg/vere/io/term.c @@ -1639,6 +1639,17 @@ _reck_orchid(u3_noun fot, u3_noun txt, c3_l* tid_l) } } +/* _term_io_quiz(): handle quiz (query to serf). +*/ +static void +_term_io_quiz(void* vod_p, u3_noun res) +{ + u3_auto* car_u = (u3_auto*)vod_p; + u3_noun wir = u3nt(c3__term, '1', u3_nul); + u3_noun cad = u3k(res); + u3_auto_plan(car_u, u3_ovum_init(0, c3__d, wir, cad)); +} + /* _term_io_kick(): apply effects. 
*/ static c3_o @@ -1727,6 +1738,19 @@ _term_io_kick(u3_auto* car_u, u3_noun wir, u3_noun cad) ret_o = c3y; u3_pier_pack(car_u->pir_u); } break; + + case c3__quac: { + ret_o = c3y; + // construct and send writ here + u3_writ* wit_u = c3_calloc(sizeof(*wit_u)); + wit_u->typ_e = u3_writ_quiz; + wit_u->qui_u.ptr_v = car_u; + u3l_log("car_u: %p", car_u); + wit_u->qui_u.quiz_f = _term_io_quiz; + + lord_writ_plan(u3K.pir_u->god_u, wit_u); + + } break; } } } diff --git a/pkg/vere/king.c b/pkg/vere/king.c index 66181c6f6a..c1ddfd9d04 100644 --- a/pkg/vere/king.c +++ b/pkg/vere/king.c @@ -1653,7 +1653,6 @@ u3_king_bail(void) void u3_king_grab(void* vod_p) { - c3_w tot_w = 0; FILE* fil_u; u3_assert( u3R == &(u3H->rod_u) ); @@ -1689,11 +1688,32 @@ u3_king_grab(void* vod_p) } #endif - tot_w += u3m_mark(fil_u); - tot_w += u3_pier_mark(fil_u); + quac** all_u = c3_malloc(sizeof(*all_u)*6); - u3a_print_memory(fil_u, "total marked", tot_w); - u3a_print_memory(fil_u, "sweep", u3a_sweep()); + quac** var_u = u3m_mark(); + all_u[0] = var_u[0]; + all_u[1] = var_u[1]; + all_u[2] = var_u[2]; + all_u[3] = var_u[3]; + c3_free(var_u); + + c3_w tot_w = all_u[0]->siz_w + all_u[1]->siz_w + + all_u[2]->siz_w + all_u[3]->siz_w; + + all_u[4] = c3_calloc(sizeof(*all_u[4])); + all_u[4]->nam_c = "total marked"; + all_u[4]->siz_w = tot_w; + + all_u[5] = c3_calloc(sizeof(*all_u[5])); + all_u[5]->nam_c = "sweep"; + all_u[5]->siz_w = u3a_sweep(); + + for ( c3_w i_w = 0; i_w < 6; i_w++ ) { + u3a_print_quac(fil_u, 0, all_u[i_w]); + u3a_quac_free(all_u[i_w]); + } + + c3_free(all_u); #ifdef U3_MEMORY_LOG { diff --git a/pkg/vere/lord.c b/pkg/vere/lord.c index da739f838d..565dab2baf 100644 --- a/pkg/vere/lord.c +++ b/pkg/vere/lord.c @@ -23,6 +23,7 @@ [%peek mil=@ sam=*] :: gang (each path $%([%once @tas @tas path] [%beam @tas beam])) [%play eve=@ lit=(list ?((pair @da ovum) *))] [%work mil=@ job=(pair @da ovum)] + [%quiz $%([%quac ~])] == :: +plea: from serf to king :: @@ -31,6 +32,7 @@ [%ripe [pro=%1 hon=@ 
nok=@] eve=@ mug=@] [%slog pri=@ tank] [%flog cord] + [%quiz $%([%quac p=*])] $: %peek $% [%done dat=(unit (cask))] [%bail dud=goof] @@ -521,32 +523,16 @@ _lord_plea_play(u3_lord* god_u, u3_noun dat) u3z(dat); } -/* _lord_plea_mass(): inject mass report +/* _lord_plea_quiz(): handle quiz (query to serf). */ static void -_lord_plea_mass(u3_lord* god_u, u3_noun dat) +_lord_plea_quiz(u3_lord* god_u, u3_noun dat) { - u3_noun cad = u3nc(c3__quac, dat); - u3_noun wir = u3nc(c3__quac, u3_nul); - u3_ovum* egg_u = u3_ovum_init(0, c3__k, wir, cad); - - u3_pier* pir_u = god_u->cb_u.ptr_v; - u3_auto* car_u = c3_calloc(sizeof(*car_u)); - u3_noun ovo; - - car_u->pir_u = pir_u; - car_u->nam_m = c3__quac; // is that right? I did it by analogy w/ smth - - u3_auto_plan(car_u, egg_u); - - u3_assert( u3_auto_next(car_u, &ovo) == egg_u ); - - { - struct timeval tim_tv; - gettimeofday(&tim_tv, 0); - u3_lord_work(god_u, egg_u, u3nc(u3_time_in_tv(&tim_tv), ovo)); - } + u3_writ* wit_u = _lord_writ_need(god_u, u3_writ_quiz); + wit_u->qui_u.quiz_f(wit_u->qui_u.ptr_v, dat); + u3z(dat); } + /* _lord_work_spin(): update spinner if more work is in progress. */ static void @@ -769,9 +755,8 @@ _lord_on_plea(void* ptr_v, c3_d len_d, c3_y* byt_y) _lord_plea_ripe(god_u, u3k(dat)); } break; - case c3__quac: { - _lord_plea_mass(god_u, u3k(dat)); - + case c3__quiz: { + _lord_plea_quiz(god_u, u3k(dat)); } break; } @@ -843,6 +828,10 @@ _lord_writ_make(u3_lord* god_u, u3_writ* wit_u) // msg = u3nt(c3__live, c3__exit, 0); } break; + + case u3_writ_quiz: { + msg = u3nt(c3__quiz, c3__quac, u3_nul); + } break; } return msg; @@ -880,10 +869,10 @@ _lord_writ_send(u3_lord* god_u, u3_writ* wit_u) } } -/* _lord_writ_plan(): enqueue a writ and send. +/* lord_writ_plan(): enqueue a writ and send. 
*/ -static void -_lord_writ_plan(u3_lord* god_u, u3_writ* wit_u) +void +lord_writ_plan(u3_lord* god_u, u3_writ* wit_u) { if ( !god_u->ent_u ) { u3_assert( !god_u->ext_u ); @@ -936,7 +925,7 @@ u3_lord_peek(u3_lord* god_u, u3_pico* pic_u) // XX cache check, unless last // - _lord_writ_plan(god_u, wit_u); + lord_writ_plan(god_u, wit_u); } /* u3_lord_play(): recompute batch. @@ -952,7 +941,7 @@ u3_lord_play(u3_lord* god_u, u3_info fon_u) // // u3_assert( !pay_u.ent_u->nex_u ); - _lord_writ_plan(god_u, wit_u); + lord_writ_plan(god_u, wit_u); } /* u3_lord_work(): attempt work. @@ -974,7 +963,7 @@ u3_lord_work(u3_lord* god_u, u3_ovum* egg_u, u3_noun job) god_u->pin_o = c3y; } - _lord_writ_plan(god_u, wit_u); + lord_writ_plan(god_u, wit_u); } /* u3_lord_save(): save a snapshot. @@ -988,7 +977,7 @@ u3_lord_save(u3_lord* god_u) else { u3_writ* wit_u = _lord_writ_new(god_u); wit_u->typ_e = u3_writ_save; - _lord_writ_plan(god_u, wit_u); + lord_writ_plan(god_u, wit_u); return c3y; } } @@ -1004,7 +993,7 @@ u3_lord_cram(u3_lord* god_u) else { u3_writ* wit_u = _lord_writ_new(god_u); wit_u->typ_e = u3_writ_cram; - _lord_writ_plan(god_u, wit_u); + lord_writ_plan(god_u, wit_u); return c3y; } } @@ -1016,7 +1005,7 @@ u3_lord_meld(u3_lord* god_u) { u3_writ* wit_u = _lord_writ_new(god_u); wit_u->typ_e = u3_writ_meld; - _lord_writ_plan(god_u, wit_u); + lord_writ_plan(god_u, wit_u); } /* u3_lord_pack(): defragment persistent state. @@ -1026,7 +1015,7 @@ u3_lord_pack(u3_lord* god_u) { u3_writ* wit_u = _lord_writ_new(god_u); wit_u->typ_e = u3_writ_pack; - _lord_writ_plan(god_u, wit_u); + lord_writ_plan(god_u, wit_u); } /* u3_lord_exit(): shutdown gracefully. 
@@ -1036,7 +1025,7 @@ u3_lord_exit(u3_lord* god_u) { u3_writ* wit_u = _lord_writ_new(god_u); wit_u->typ_e = u3_writ_exit; - _lord_writ_plan(god_u, wit_u); + lord_writ_plan(god_u, wit_u); // XX set timer, then halt } diff --git a/pkg/vere/main.c b/pkg/vere/main.c index 012f4afdfa..dd92507f81 100644 --- a/pkg/vere/main.c +++ b/pkg/vere/main.c @@ -1041,10 +1041,7 @@ _cw_serf_writ(void* vod_p, c3_d len_d, c3_y* byt_y) // all references must now be counted, and all roots recorded // - u3_weak serf_post_out = u3_serf_post(&u3V); - if (serf_post_out != u3_none) { - _cw_serf_send(u3nc(c3__quac, serf_post_out)); - } + u3_serf_post(&u3V); } } @@ -1734,7 +1731,7 @@ _cw_grab(c3_i argc, c3_c* argv[]) u3m_boot(u3_Host.dir_c, (size_t)1 << u3_Host.ops_u.lom_y); u3C.wag_w |= u3o_hashless; - u3_serf_grab(); + u3z(u3_serf_grab(c3y)); u3m_stop(); } @@ -3114,6 +3111,7 @@ main(c3_i argc, // validate whether we can execute disk migration if ( u3_Host.ops_u.nuu == c3n ) { _cw_play_impl(0, 0, c3n, c3n, c3n); + signal(SIGTSTP, _stop_exit); // XX unmap loom, else parts of the snapshot could be left in memory } diff --git a/pkg/vere/serf.c b/pkg/vere/serf.c index 2304a926da..658a0b9ff3 100644 --- a/pkg/vere/serf.c +++ b/pkg/vere/serf.c @@ -26,13 +26,14 @@ :: next steps: [%peek mil=@ sam=*] :: gang (each path $%([%once @tas @tas path] [beam @tas beam])) [%play eve=@ lit=(list ?((pair @da ovum) *))] [%work mil=@ job=(pair @da ovum)] + [%quiz $%([%quac ~])] == :: +plea: from serf to king :: +$ plea $% [%live ~] [%ripe [pro=%1 hon=@ nok=@] eve=@ mug=@] - [%quac p=(unit *)] + [%quiz $%([%quac p=*])] [%slog pri=@ tank] [%flog cord] $: %peek @@ -71,20 +72,61 @@ enum { _serf_fag_vega = 1 << 4 // kernel reset }; +/* _serf_quac: convert a quac to a noun. 
+*/ +u3_noun +_serf_quac(quac* mas_u) +{ + u3_noun list = u3_nul; + for ( c3_w i_w = 0; i_w < mas_u->len_w; i_w++ ) { + list = u3nc(_serf_quac(mas_u->qua_u[i_w]), list); + } + list = u3kb_flop(list); + + u3_noun mas = u3nt(u3i_string(mas_u->nam_c), u3i_word(mas_u->siz_w), list); + + c3_free(mas_u->nam_c); + c3_free(mas_u->qua_u); + c3_free(mas_u); + + return mas; +} + +/* _serf_quacs: convert an array of quacs to a noun list. +*/ +u3_noun +_serf_quacs(c3_w len_w, quac** all_u) +{ + u3_noun list = u3_nul; + for ( c3_w i_w = 0; i_w < len_w; i_w++ ) { + list = u3nc(_serf_quac(all_u[i_w]), list); + } + c3_free(all_u); + return u3kb_flop(list); +} + +/* _serf_print_quacs: print an array of quacs. +*/ +void +_serf_print_quacs(FILE* fil_u, c3_w len_w, quac** all_u) +{ + for ( c3_w i_w = 0; i_w < len_w; i_w++) { + u3a_print_quac(fil_u, 0, all_u[i_w]); + } +} + /* _serf_grab(): garbage collect, checking for profiling. RETAIN. */ -static u3_weak -_serf_grab(u3_noun sac) +static u3_noun +_serf_grab(u3_noun sac, c3_o pri_o) { - u3_noun out = u3_none; if ( u3_nul == sac) { if ( u3C.wag_w & (u3o_debug_ram | u3o_check_corrupt) ) { u3m_grab(sac, u3_none); } - return u3_none; + return u3_nul; } else { - c3_w tot_w = 0; FILE* fil_u; #ifdef U3_MEMORY_LOG @@ -116,38 +158,70 @@ _serf_grab(u3_noun sac) #endif u3_assert( u3R == &(u3H->rod_u) ); - fprintf(fil_u, "\r\n"); - tot_w += u3a_maid(fil_u, "total userspace", u3a_prof(fil_u, 0, sac)); - tot_w += u3m_mark(fil_u); - tot_w += u3a_maid(fil_u, "space profile", u3a_mark_noun(sac)); - - u3a_print_memory(fil_u, "total marked", tot_w); - u3a_print_memory(fil_u, "free lists", u3a_idle(u3R)); - u3a_print_memory(fil_u, "sweep", u3a_sweep()); - - fflush(fil_u); + quac* pro_u = u3a_prof(fil_u, sac); + if ( NULL == pro_u ) { + fflush(fil_u); + u3z(sac); + return u3_nul; + } else { + quac** all_u = c3_malloc(sizeof(*all_u)*9); + all_u[0] = pro_u; + + quac** var_u = u3m_mark(); + all_u[1] = var_u[0]; + all_u[2] = var_u[1]; + all_u[3] = var_u[2]; + 
all_u[4] = var_u[3]; + c3_free(var_u); + + c3_w tot_w = all_u[0]->siz_w + all_u[1]->siz_w + all_u[2]->siz_w + + all_u[3]->siz_w + all_u[4]->siz_w; + + all_u[5] = c3_calloc(sizeof(*all_u[5])); + all_u[5]->nam_c = strdup("space profile"); + all_u[5]->siz_w = u3a_mark_noun(sac)*4; + + tot_w += all_u[5]->siz_w; + + all_u[6] = c3_calloc(sizeof(*all_u[6])); + all_u[6]->nam_c = strdup("total marked"); + all_u[6]->siz_w = tot_w; + + all_u[7] = c3_calloc(sizeof(*all_u[7])); + all_u[7]->nam_c = strdup("free lists"); + all_u[7]->siz_w = u3a_idle(u3R)*4; + + all_u[8] = c3_calloc(sizeof(*all_u[8])); + all_u[8]->nam_c = strdup("sweep"); + all_u[8]->siz_w = u3a_sweep()*4; + + if ( c3y == pri_o ) { + _serf_print_quacs(fil_u, 9, all_u); + } + fflush(fil_u); #ifdef U3_MEMORY_LOG - { - fclose(fil_u); - } + { + fclose(fil_u); + } #endif - u3z(sac); - - u3l_log(""); + u3_noun mas = _serf_quacs(9, all_u); + u3z(sac); - return u3i_word(tot_w * 4); + return mas; + } } } /* u3_serf_grab(): garbage collect. */ -void -u3_serf_grab(void) +u3_noun +u3_serf_grab(c3_o pri_o) { u3_noun sac = u3_nul; + u3_noun res = u3_nul; u3_assert( u3R == &(u3H->rod_u) ); @@ -177,30 +251,40 @@ u3_serf_grab(void) u3z(gon); } - + fprintf(stderr, "serf: measuring memory:\r\n"); - fprintf(stderr, "BEFORE sac FORK:\r\n"); if ( u3_nul != sac ) { - printf("enter _serf_grab\r\n"); - _serf_grab(sac); + res = _serf_grab(sac, pri_o); } else { fprintf(stderr, "sac is empty\r\n"); - u3a_print_memory(stderr, "total marked", u3m_mark(stderr)); + quac** var_u = u3m_mark(); + + c3_w tot_w; + tot_w = var_u[0]->siz_w + var_u[1]->siz_w + + var_u[2]->siz_w + var_u[3]->siz_w; + + for ( c3_w i_w = 0; i_w < 4; i_w++ ) { + u3a_quac_free(var_u[i_w]); + } + c3_free(var_u); + + u3a_print_memory(stderr, "total marked", tot_w/4); u3a_print_memory(stderr, "free lists", u3a_idle(u3R)); u3a_print_memory(stderr, "sweep", u3a_sweep()); fprintf(stderr, "\r\n"); } - + fflush(stderr); + + return res; } /* u3_serf_post(): update serf state 
post-writ. */ -u3_weak +void u3_serf_post(u3_serf* sef_u) { - u3_noun out = u3_none; if ( sef_u->fag_w & _serf_fag_hit1 ) { if ( u3C.wag_w & u3o_verbose ) { u3l_log("serf: threshold 1: %u", u3h_wyt(u3R->cax.per_p)); @@ -221,11 +305,8 @@ u3_serf_post(u3_serf* sef_u) // XX this runs on replay too, |mass s/b elsewhere // if ( sef_u->fag_w & _serf_fag_mute ) { - u3_weak grab_mass = _serf_grab(sef_u->sac); + u3z(_serf_grab(sef_u->sac, c3y)); sef_u->sac = u3_nul; - if (grab_mass != u3_none) { - out = u3nc(u3_nul, grab_mass); - } } if ( sef_u->fag_w & _serf_fag_hit0 ) { @@ -243,7 +324,6 @@ u3_serf_post(u3_serf* sef_u) } sef_u->fag_w = _serf_fag_none; - return out; } /* _serf_curb(): check for memory threshold @@ -918,7 +998,7 @@ u3_serf_live(u3_serf* sef_u, u3_noun com, u3_noun* ret) } u3m_save(); - u3_serf_grab(); + u3_serf_grab(c3y); *ret = u3nc(c3__live, u3_nul); return c3y; @@ -1036,10 +1116,22 @@ u3_serf_writ(u3_serf* sef_u, u3_noun wit, u3_noun* pel) ret_o = c3y; } } break; + case c3__quiz: { + u3z(wit); + u3_noun res = u3_serf_grab(c3n); + if ( u3_none == res ) { + ret_o = c3n; + } else { + *pel = u3nt(c3__quiz, c3__quac, res); + ret_o = c3y; + } + } break; } } - u3z(wit); + if ( tag != c3__quiz ) { + u3z(wit); + } return ret_o; } diff --git a/pkg/vere/serf.h b/pkg/vere/serf.h index bfd86e5765..973538f4f1 100644 --- a/pkg/vere/serf.h +++ b/pkg/vere/serf.h @@ -51,12 +51,18 @@ /* u3_serf_post(): update serf state post-writ. */ - u3_weak + void u3_serf_post(u3_serf* sef_u); /* u3_serf_grab(): garbage collect. + */ + u3_noun + u3_serf_grab(c3_o pri_o); + + /* u3_quac_free(): free quac memory. 
*/ void - u3_serf_grab(void); + u3a_quac_free(quac* qua_u); + #endif /* ifndef U3_VERE_SERF_H */ diff --git a/pkg/vere/vere.h b/pkg/vere/vere.h index f97a76f142..9d4cfdb5aa 100644 --- a/pkg/vere/vere.h +++ b/pkg/vere/vere.h @@ -456,7 +456,8 @@ u3_writ_cram = 4, u3_writ_meld = 5, u3_writ_pack = 6, - u3_writ_exit = 7 + u3_writ_exit = 7, + u3_writ_quiz = 8 } u3_writ_type; /* u3_writ: ipc message from king to serf @@ -472,6 +473,10 @@ u3_peek* pek_u; // peek u3_info fon_u; // recompute c3_d eve_d; // save/pack at + struct { // serf query: + void* ptr_v; // driver + void (*quiz_f)(void*, u3_noun); // callback + } qui_u; // }; } u3_writ; @@ -765,7 +770,11 @@ u3_atom u3_time_t_in_ts(time_t tim); #endif - + /* lord_writ_plan(): enqueue a writ and send. + */ + void + lord_writ_plan(u3_lord* god_u, u3_writ* wit_u); + /* u3_time_out_ts(): struct timespec from urbit time. */ void From a6a4d0995a30b3ff0320c8c49d4b2e78e1154516 Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Thu, 11 Jul 2024 14:07:02 -0400 Subject: [PATCH 45/97] http: cleanup ranges, content range header --- pkg/vere/io/http.c | 100 ++++++++++++++++++++++++++------------------- 1 file changed, 59 insertions(+), 41 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index f22b86679e..18e4928e3c 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -645,6 +645,23 @@ _http_cache_respond(u3_hreq* req_u, u3_noun nun); static void _http_scry_respond(u3_hreq* req_u, u3_noun nun); +static u3_noun +_content_rng(c3_z start_z, c3_z end_z, c3_w total_w) +{ + u3_noun out; + + u3_noun lin = u3i_list(u3i_string("bytes "), + u3do("crip", u3do("a-co:co", start_z)), + c3_s1('-'), + u3do("crip", u3do("a-co:co", end_z)), + c3_s1('/'), + u3do("crip", u3do("a-co:co", total_w)), + u3_none); + u3_atom dat = u3qc_rap(3, lin); + out = u3nc(u3i_string("Content-Range"), dat); + return out; +} + /* _http_foo_cb() */ static void @@ -654,11 +671,18 @@ _http_foo_cb(void* vod_p, u3_noun nun) u3_httd* htd_u = peq_u->htd_u; 
u3_hreq* req_u = peq_u->req_u; - // TODO range if ( req_u ) { u3_assert(u3_rsat_peek == req_u->sat_e); req_u->peq_u = 0; - _http_scry_respond(req_u, u3k(nun)); + if ( u3_nul != nun ) { + u3_atom lent = u3r_at(254, nun); + u3_noun con_rng_hed = _content_rng(0, (lent - 1), lent); + u3_noun mun = u3i_edit(u3k(nun), 125, u3nc(con_rng_hed, u3r_at(125, nun))); + _http_scry_respond(req_u, u3k(mun)); + } + else { + _http_scry_respond(req_u, u3k(nun)); + } } if ( peq_u->las_o == c3n ) { @@ -700,8 +724,12 @@ _find_tis_fas(void* txt, c3_w len) // [x] test stream // [x] content range function // [x] video controls -// [ ] fix repeated 1 byte requests -// [ ] multipart ranges? +// [x] fix repeated 1 byte requests +// [ ] disappearing headers +// [ ] fix 200 +// [ ] content-length +// [ ] mite +// [ ] don't crash, check multipart ranges // typedef struct _range_header { c3_z start_z; @@ -721,19 +749,19 @@ _slice_mime(range_header rng, u3_noun octs) c3_w oct_w = u3t(octs); content out; + out.start_z = SIZE_MAX; + out.end_z = SIZE_MAX; + out.dat = u3_nul; + if ( rng.start_z == SIZE_MAX ) { if ( rng.end_z == SIZE_MAX ) { // [~ ~] - out.start_z = SIZE_MAX; - out.end_z = SIZE_MAX; - out.dat = u3_nul; + return out; } else { // [~ @] if ( rng.end_z > lent_w ) { - out.start_z = SIZE_MAX; - out.end_z = SIZE_MAX; - out.dat = u3_nul; + return out; } else { // slice last bytes @@ -747,28 +775,24 @@ _slice_mime(range_header rng, u3_noun octs) else if ( rng.end_z == SIZE_MAX ) { // [@ ~] if ( rng.start_z > lent_w ) { - out.start_z = SIZE_MAX; - out.end_z = SIZE_MAX; - out.dat = u3_nul; + return out; } else { out.start_z = rng.start_z; - out.end_z = lent_w; - out.dat = u3nc(lent_w - rng.start_z, - u3qc_cut(3, rng.start_z, rng.end_z, oct_w)); + out.end_z = lent_w - 1; + out.dat = u3nc((out.end_z - out.start_z) + 1, + u3qc_cut(3, out.start_z, (out.end_z + 1) - out.start_z, oct_w)); } } else if (rng.end_z > lent_w) { - out.start_z = SIZE_MAX; - out.end_z = SIZE_MAX; - out.dat = u3_nul; + return 
out; } else { // [@ @] out.start_z = rng.start_z; - out.end_z = (rng.end_z - rng.start_z) + 1; - out.dat = u3nc((rng.end_z - rng.start_z) + 1, - u3qc_cut(3, out.start_z, out.end_z, oct_w)); + out.end_z = rng.end_z; + out.dat = u3nc((out.end_z - out.start_z) + 1, + u3qc_cut(3, out.start_z, (out.end_z + 1) - out.start_z, oct_w)); } return out; } @@ -794,23 +818,6 @@ _get_range(c3_c* txt_c, c3_w len_w) return slice; } -static u3_noun -_content_rng(c3_z start_z, c3_z end_z, c3_w lent_w) -{ - u3_noun out; - - u3_noun lin = u3i_list(u3i_string("bytes "), - u3do("crip", u3do("a-co:co", start_z)), - c3_s1('-'), - u3do("crip", u3do("a-co:co", end_z)), - c3_s1('/'), - u3do("crip", u3do("a-co:co", ++lent_w )), - u3_none); - u3_atom dat = u3qc_rap(3, lin); - out = u3nc(u3i_string("Content-Range"), dat); - return out; -} - /* _http_req_dispatch(): dispatch http request to %eyre */ static void @@ -980,12 +987,19 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); return; } - // if ( u3r_sing(result, octs) == c3y) { + + // if ( u3r_sing(result.dat, octs) == c3y) { // // 200 - // _http_cache_respond(req_u, nac); + // u3l_log("sending 200"); + // u3_atom lent = u3r_at(254, nac); + // u3_noun con_rng_hed = _content_rng(0, lent, lent); + // u3_noun mac = u3i_edit(nac, 125, u3nc(con_rng_hed, u3r_at(125, nac))); + // _http_cache_respond(req_u, mac); + // // _http_cache_respond(req_u, nac); // } // else { // 206 + u3l_log("sending 206"); u3_noun con_rng_hed = _content_rng(result.start_z, result.end_z, u3h(octs)); u3_noun res = u3i_edit(nac, 127, result.dat); res = u3i_edit(res, 124, 206); @@ -1033,7 +1047,9 @@ _http_scry_respond(u3_hreq* req_u, u3_noun nun) { } else { u3_noun auth, response_header, data; + // XX check: looks good u3x_qual(u3k(u3t(u3t(nun))), &auth, 0, &response_header, &data); + // u3m_p("res-header", response_header); u3_noun status, headers; u3x_cell(response_header, &status, &headers); @@ -1255,6 +1271,7 @@ 
_http_start_respond(u3_hreq* req_u, u3_noun data, u3_noun complete) { + u3m_p("start_respond headers", headers); if ( u3_rsat_plan != req_u->sat_e ) { u3l_log("http: %%start not sane"); u3z(status); u3z(headers); u3z(data); u3z(complete); @@ -1280,6 +1297,7 @@ _http_start_respond(u3_hreq* req_u, c3_i has_len_i = 0; while ( 0 != hed_u ) { + // u3l_log("start_respond header: %s", hed_u->nam_c); if ( 0x200 <= rec_u->version ) { h2o_strtolower(hed_u->nam_c, hed_u->nam_w); From 485f75c560c975d1ad7bae20478542e54961ff67 Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Thu, 11 Jul 2024 15:10:37 -0400 Subject: [PATCH 46/97] http: cleanup slice_mime --- pkg/vere/io/http.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index 18e4928e3c..c3f742fe03 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -766,9 +766,7 @@ _slice_mime(range_header rng, u3_noun octs) else { // slice last bytes out.start_z = lent_w - rng.end_z; - out.end_z = rng.end_z; - out.dat = u3nc(rng.end_z, - u3qc_cut(3, out.start_z, out.end_z, oct_w)); + out.end_z = lent_w - 1; } } } @@ -780,8 +778,6 @@ _slice_mime(range_header rng, u3_noun octs) else { out.start_z = rng.start_z; out.end_z = lent_w - 1; - out.dat = u3nc((out.end_z - out.start_z) + 1, - u3qc_cut(3, out.start_z, (out.end_z + 1) - out.start_z, oct_w)); } } else if (rng.end_z > lent_w) { @@ -791,6 +787,10 @@ _slice_mime(range_header rng, u3_noun octs) // [@ @] out.start_z = rng.start_z; out.end_z = rng.end_z; + } + if ( (out.start_z <= lent_w) && + (out.end_z <= lent_w) && + (out.start_z <= out.end_z) ) { out.dat = u3nc((out.end_z - out.start_z) + 1, u3qc_cut(3, out.start_z, (out.end_z + 1) - out.start_z, oct_w)); } From 3698ec77720156c05937dc30f45cb290837a36c0 Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Thu, 11 Jul 2024 15:19:31 -0400 Subject: [PATCH 47/97] http: slice_mime lth not lte --- pkg/vere/io/http.c | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index c3f742fe03..9afc340eec 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -788,8 +788,8 @@ _slice_mime(range_header rng, u3_noun octs) out.start_z = rng.start_z; out.end_z = rng.end_z; } - if ( (out.start_z <= lent_w) && - (out.end_z <= lent_w) && + if ( (out.start_z < lent_w) && + (out.end_z < lent_w) && (out.start_z <= out.end_z) ) { out.dat = u3nc((out.end_z - out.start_z) + 1, u3qc_cut(3, out.start_z, (out.end_z + 1) - out.start_z, oct_w)); From 5f92f2501e085567f53f0f683f8f6fca842d6dc3 Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Fri, 12 Jul 2024 10:37:33 -0400 Subject: [PATCH 48/97] http: fix headers: working --- pkg/vere/io/http.c | 47 +++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index 9afc340eec..c0e7db1ae8 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -646,9 +646,11 @@ static void _http_scry_respond(u3_hreq* req_u, u3_noun nun); static u3_noun -_content_rng(c3_z start_z, c3_z end_z, c3_w total_w) +_content_headers(c3_z start_z, c3_z end_z, c3_w total_w) { u3_noun out; + u3_noun con_rng; + u3_noun con_len; u3_noun lin = u3i_list(u3i_string("bytes "), u3do("crip", u3do("a-co:co", start_z)), @@ -658,7 +660,11 @@ _content_rng(c3_z start_z, c3_z end_z, c3_w total_w) u3do("crip", u3do("a-co:co", total_w)), u3_none); u3_atom dat = u3qc_rap(3, lin); - out = u3nc(u3i_string("Content-Range"), dat); + con_rng = u3nc(u3i_string("Content-Range"), dat); + con_len = u3nc(u3i_string("Content-Length"), + u3do("crip", u3do("a-co:co", (end_z - start_z) + 1))); + + out = u3i_list(con_rng, con_len, u3_none); return out; } @@ -676,8 +682,8 @@ _http_foo_cb(void* vod_p, u3_noun nun) req_u->peq_u = 0; if ( u3_nul != nun ) { u3_atom lent = u3r_at(254, nun); - u3_noun con_rng_hed = _content_rng(0, (lent - 1), lent); - u3_noun mun = u3i_edit(u3k(nun), 125, u3nc(con_rng_hed, 
u3r_at(125, nun))); + u3_noun cont_heds = _content_headers(0, (lent - 1), lent); + u3_noun mun = u3i_edit(u3k(nun), 125, u3qb_weld(cont_heds, u3r_at(125, nun))); _http_scry_respond(req_u, u3k(mun)); } else { @@ -725,9 +731,9 @@ _find_tis_fas(void* txt, c3_w len) // [x] content range function // [x] video controls // [x] fix repeated 1 byte requests -// [ ] disappearing headers -// [ ] fix 200 -// [ ] content-length +// [x] disappearing headers +// [x] fix 200 +// [x] content-length // [ ] mite // [ ] don't crash, check multipart ranges // @@ -988,24 +994,21 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) return; } - // if ( u3r_sing(result.dat, octs) == c3y) { - // // 200 - // u3l_log("sending 200"); - // u3_atom lent = u3r_at(254, nac); - // u3_noun con_rng_hed = _content_rng(0, lent, lent); - // u3_noun mac = u3i_edit(nac, 125, u3nc(con_rng_hed, u3r_at(125, nac))); - // _http_cache_respond(req_u, mac); - // // _http_cache_respond(req_u, nac); - // } - // else { + if ( u3r_sing(result.dat, octs) == c3y) { + // 200 + u3_atom lent = u3r_at(254, nac); + u3_noun cont_heds = _content_headers(0, (lent - 1), lent); + u3_noun mac = u3i_edit(nac, 125, u3qb_weld(cont_heds, u3r_at(125, nac))); + _http_cache_respond(req_u, mac); + } + else { // 206 - u3l_log("sending 206"); - u3_noun con_rng_hed = _content_rng(result.start_z, result.end_z, u3h(octs)); + u3_noun cont_heds = _content_headers(result.start_z, result.end_z, u3h(octs)); u3_noun res = u3i_edit(nac, 127, result.dat); res = u3i_edit(res, 124, 206); - res = u3i_edit(res, 125, u3nc(con_rng_hed, u3r_at(125, res))); + res = u3i_edit(res, 125, u3qb_weld(cont_heds, u3r_at(125, res))); _http_cache_respond(req_u, res); - // } + } } } else { @@ -1271,7 +1274,6 @@ _http_start_respond(u3_hreq* req_u, u3_noun data, u3_noun complete) { - u3m_p("start_respond headers", headers); if ( u3_rsat_plan != req_u->sat_e ) { u3l_log("http: %%start not sane"); u3z(status); u3z(headers); u3z(data); u3z(complete); @@ -1297,7 +1299,6 @@ 
_http_start_respond(u3_hreq* req_u, c3_i has_len_i = 0; while ( 0 != hed_u ) { - // u3l_log("start_respond header: %s", hed_u->nam_c); if ( 0x200 <= rec_u->version ) { h2o_strtolower(hed_u->nam_c, hed_u->nam_w); From 717bae5daed175fc74dd34448892923e15bc0dbb Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Fri, 12 Jul 2024 12:48:37 -0500 Subject: [PATCH 49/97] mass: `quac` cleanup --- pkg/noun/allocate.c | 24 +++++++++++------------- pkg/noun/allocate.h | 11 ++++++++--- pkg/noun/jets.c | 10 +++++----- pkg/noun/jets.h | 2 +- pkg/noun/manage.c | 6 +++--- pkg/noun/manage.h | 10 +++++----- pkg/noun/nock.c | 6 +++--- pkg/noun/nock.h | 2 +- pkg/noun/vortex.c | 7 +++---- pkg/noun/vortex.h | 2 +- pkg/vere/king.c | 6 +++--- pkg/vere/serf.c | 30 +++++++++++++++--------------- pkg/vere/serf.h | 5 ----- 13 files changed, 59 insertions(+), 62 deletions(-) diff --git a/pkg/noun/allocate.c b/pkg/noun/allocate.c index 620d5fac5c..f17392f4f0 100644 --- a/pkg/noun/allocate.c +++ b/pkg/noun/allocate.c @@ -9,8 +9,6 @@ #include "retrieve.h" #include "trace.h" #include "vortex.h" -#include "noun.h" -#include "defs.h" u3_road* u3a_Road; @@ -2032,7 +2030,7 @@ _ca_print_memory(FILE* fil_u, c3_w byt_w) /* u3a_quac_free: free quac memory. */ void -u3a_quac_free(quac* qua_u) +u3a_quac_free(u3m_quac* qua_u) { for ( c3_w i_w = 0; i_w < qua_u->len_w; i_w++ ) { u3a_quac_free(qua_u->qua_u[i_w]); @@ -2044,11 +2042,11 @@ u3a_quac_free(quac* qua_u) /* u3a_prof(): mark/measure/print memory profile. RETAIN. 
*/ -quac* +u3m_quac* u3a_prof(FILE* fil_u, u3_noun mas) { // c3_w tot_w = 0; - quac* pro_u = c3_calloc(sizeof(*pro_u)); + u3m_quac* pro_u = c3_calloc(sizeof(*pro_u)); u3_noun h_mas, t_mas; if ( c3n == u3r_cell(mas, &h_mas, &t_mas) ) { @@ -2120,7 +2118,7 @@ u3a_prof(FILE* fil_u, u3_noun mas) c3_w i_w = 2; c3_t bad_t = 0; while ( c3y == u3du(tt_mas) ) { - quac* new_u = u3a_prof(fil_u, u3h(tt_mas)); + u3m_quac* new_u = u3a_prof(fil_u, u3h(tt_mas)); if ( NULL == new_u ) { c3_free(new_u); bad_t = 1; @@ -2133,7 +2131,7 @@ u3a_prof(FILE* fil_u, u3_noun mas) tt_mas = u3t(tt_mas); i_w++; } - + if ( bad_t ) { for ( i_w = 0; i_w < pro_u->len_w ; i_w++ ) { u3a_quac_free(pro_u->qua_u[i_w]); @@ -2155,14 +2153,14 @@ u3a_prof(FILE* fil_u, u3_noun mas) } -/* u3a_print_quac: print a quac memory report. +/* u3a_print_quac: print a memory report. */ void -u3a_print_quac(FILE* fil_u, c3_w den_w, quac* mas_u) +u3a_print_quac(FILE* fil_u, c3_w den_w, u3m_quac* mas_u) { u3_assert( 0 != fil_u ); - + if ( mas_u->siz_w ) { fprintf(fil_u, "%*s%s: ", den_w, "", mas_u->nam_c); @@ -2182,10 +2180,10 @@ u3a_print_quac(FILE* fil_u, c3_w den_w, quac* mas_u) /* u3a_mark_road(): mark ad-hoc persistent road structures. */ -quac* +u3m_quac* u3a_mark_road() { - quac** qua_u = c3_malloc(sizeof(*qua_u)*8); + u3m_quac** qua_u = c3_malloc(sizeof(*qua_u)*8); qua_u[0] = c3_calloc(sizeof(*qua_u[0])); qua_u[0]->nam_c = strdup("namespace"); @@ -2224,7 +2222,7 @@ u3a_mark_road() sum_w += qua_u[i_w]->siz_w; } - quac* tot_u = c3_malloc(sizeof(*tot_u)); + u3m_quac* tot_u = c3_malloc(sizeof(*tot_u)); tot_u->nam_c = strdup("total road stuff"); tot_u->siz_w = sum_w; tot_u->len_w = 8; diff --git a/pkg/noun/allocate.h b/pkg/noun/allocate.h index 7763fd3183..d0954f9656 100644 --- a/pkg/noun/allocate.h +++ b/pkg/noun/allocate.h @@ -606,7 +606,7 @@ /* u3a_mark_road(): mark ad-hoc persistent road structures. */ - quac* + u3m_quac* u3a_mark_road(); /* u3a_reclaim(): clear ad-hoc persistent caches to reclaim memory. 
@@ -705,7 +705,7 @@ /* u3a_print_quac: print a quac memory report. */ void - u3a_print_quac(FILE* fil_u, c3_w den_w, quac* mas_u); + u3a_print_quac(FILE* fil_u, c3_w den_w, u3m_quac* mas_u); /* u3a_print_memory(): print memory amount. */ @@ -713,7 +713,7 @@ u3a_print_memory(FILE* fil_u, c3_c* cap_c, c3_w wor_w); /* u3a_prof(): mark/measure/print memory profile. RETAIN. */ - quac* + u3m_quac* u3a_prof(FILE* fil_u, u3_noun mas); /* u3a_maid(): maybe print memory. @@ -721,6 +721,11 @@ c3_w u3a_maid(FILE* fil_u, c3_c* cap_c, c3_w wor_w); + /* u3a_quac_free(): free quac memory. + */ + void + u3a_quac_free(u3m_quac* qua_u); + /* u3a_uncap_print_memory(): un-captioned print memory amount. */ void diff --git a/pkg/noun/jets.c b/pkg/noun/jets.c index e64ce26c8a..2f1dc3a0df 100644 --- a/pkg/noun/jets.c +++ b/pkg/noun/jets.c @@ -2305,10 +2305,10 @@ _cj_mark_hank(u3_noun kev, void* dat) /* u3j_mark(): mark jet state for gc. */ -quac* +u3m_quac* u3j_mark() { - quac** qua_u = c3_malloc(sizeof(*qua_u)*6); + u3m_quac** qua_u = c3_malloc(sizeof(*qua_u)*6); qua_u[0] = c3_calloc(sizeof(*qua_u[0])); qua_u[0]->nam_c = strdup("warm jet state"); @@ -2325,18 +2325,18 @@ u3j_mark() qua_u[3] = c3_calloc(sizeof(*qua_u[3])); qua_u[3]->nam_c = strdup("battery hash cache"); qua_u[3]->siz_w = u3h_mark(u3R->jed.bas_p)*4; - + qua_u[4] = c3_calloc(sizeof(*qua_u[4])); qua_u[4]->nam_c = strdup("call site cache"); u3h_walk_with(u3R->jed.han_p, _cj_mark_hank, &qua_u[4]->siz_w); qua_u[4]->siz_w = qua_u[4]->siz_w*4; - + c3_w sum_w = 0; for ( c3_w i_w = 0; i_w < 5; i_w++ ) { sum_w += qua_u[i_w]->siz_w; } - quac* tot_u = c3_calloc(sizeof(*tot_u)); + u3m_quac* tot_u = c3_calloc(sizeof(*tot_u)); tot_u->nam_c = strdup("total jet stuff"); if ( u3R == &(u3H->rod_u) ) { diff --git a/pkg/noun/jets.h b/pkg/noun/jets.h index 02c2c54950..81301d1199 100644 --- a/pkg/noun/jets.h +++ b/pkg/noun/jets.h @@ -296,7 +296,7 @@ /* u3j_mark(): mark jet state for gc. 
*/ - quac* + u3m_quac* u3j_mark(); /* u3j_free(): free jet state. diff --git a/pkg/noun/manage.c b/pkg/noun/manage.c index 7726ce6c2b..3a462ef9ea 100644 --- a/pkg/noun/manage.c +++ b/pkg/noun/manage.c @@ -459,15 +459,15 @@ u3m_file(c3_c* pas_c) /* u3m_mark(): mark all nouns in the road. */ -quac** +u3m_quac** u3m_mark() { - quac** qua_u = c3_malloc(sizeof(*qua_u)*4); + u3m_quac** qua_u = c3_malloc(sizeof(*qua_u)*4); qua_u[0] = u3v_mark(); qua_u[1] = u3j_mark(); qua_u[2] = u3n_mark(); qua_u[3] = u3a_mark_road(); - + return qua_u; } diff --git a/pkg/noun/manage.h b/pkg/noun/manage.h index ce1e66a86b..33c75b4ee4 100644 --- a/pkg/noun/manage.h +++ b/pkg/noun/manage.h @@ -149,18 +149,18 @@ u3m_soft_esc(u3_noun ref, u3_noun sam); - /* quac: memory report. + /* u3m_quac: memory report. */ - typedef struct _quac { + typedef struct _u3m_quac { c3_c* nam_c; c3_w siz_w; c3_w len_w; - struct _quac** qua_u; - } quac; + struct _u3m_quac** qua_u; + } u3m_quac; /* u3m_mark(): mark all nouns in the road. */ - quac** + u3m_quac** u3m_mark(); /* u3m_grab(): garbage-collect the world, plus extra roots. diff --git a/pkg/noun/nock.c b/pkg/noun/nock.c index 60f713f831..e7b1ab64be 100644 --- a/pkg/noun/nock.c +++ b/pkg/noun/nock.c @@ -3047,10 +3047,10 @@ _n_bam(u3_noun kev, void* dat) /* u3n_mark(): mark the bytecode cache for gc. 
*/ -quac* +u3m_quac* u3n_mark() { - quac** qua_u = c3_malloc(sizeof(*qua_u)*2); + u3m_quac** qua_u = c3_malloc(sizeof(*qua_u)*2); qua_u[0] = c3_calloc(sizeof(*qua_u[0])); qua_u[0]->nam_c = strdup("bytecode programs"); @@ -3063,7 +3063,7 @@ u3n_mark() qua_u[1]->nam_c = strdup("bytecode cache"); qua_u[1]->siz_w = u3h_mark(har_p)*4; - quac* tot_u = c3_malloc(sizeof(*tot_u)); + u3m_quac* tot_u = c3_malloc(sizeof(*tot_u)); tot_u->nam_c = strdup("total nock stuff"); tot_u->siz_w = qua_u[0]->siz_w + qua_u[1]->siz_w; tot_u->len_w = 2; diff --git a/pkg/noun/nock.h b/pkg/noun/nock.h index 19489d304c..266438119d 100644 --- a/pkg/noun/nock.h +++ b/pkg/noun/nock.h @@ -123,7 +123,7 @@ /* u3n_mark(): mark bytecode cache. */ - quac* + u3m_quac* u3n_mark(); /* u3n_reclaim(): clear ad-hoc persistent caches to reclaim memory. diff --git a/pkg/noun/vortex.c b/pkg/noun/vortex.c index 8de5b2bd88..c49d73780f 100644 --- a/pkg/noun/vortex.c +++ b/pkg/noun/vortex.c @@ -392,12 +392,12 @@ u3v_sway(u3_noun blu, c3_l tab_l, u3_noun tax) /* u3v_mark(): mark arvo kernel. */ -quac* +u3m_quac* u3v_mark() { u3v_arvo* arv_u = &(u3H->arv_u); - quac** qua_u = c3_malloc(sizeof(*qua_u)*3); + u3m_quac** qua_u = c3_malloc(sizeof(*qua_u)*3); qua_u[0] = c3_calloc(sizeof(*qua_u[0])); qua_u[0]->nam_c = strdup("kernel"); @@ -411,7 +411,7 @@ u3v_mark() qua_u[2]->nam_c = strdup("wish cache"); qua_u[2]->siz_w = u3a_mark_noun(arv_u->yot)*4; - quac* tot_u = c3_malloc(sizeof(*tot_u)); + u3m_quac* tot_u = c3_malloc(sizeof(*tot_u)); tot_u->nam_c = strdup("total arvo stuff"); tot_u->siz_w = qua_u[0]->siz_w + qua_u[1]->siz_w + qua_u[2]->siz_w; tot_u->len_w = 3; @@ -450,4 +450,3 @@ u3v_rewrite_compact() arv_u->now = u3a_rewritten_noun(arv_u->now); arv_u->yot = u3a_rewritten_noun(arv_u->yot); } - diff --git a/pkg/noun/vortex.h b/pkg/noun/vortex.h index c254b32625..e2377fb9be 100644 --- a/pkg/noun/vortex.h +++ b/pkg/noun/vortex.h @@ -130,7 +130,7 @@ /* u3v_mark(): mark arvo kernel. 
*/ - quac* + u3m_quac* u3v_mark(); /* u3v_reclaim(): clear ad-hoc persistent caches to reclaim memory. diff --git a/pkg/vere/king.c b/pkg/vere/king.c index c1ddfd9d04..668237245a 100644 --- a/pkg/vere/king.c +++ b/pkg/vere/king.c @@ -1688,9 +1688,9 @@ u3_king_grab(void* vod_p) } #endif - quac** all_u = c3_malloc(sizeof(*all_u)*6); + u3m_quac** all_u = c3_malloc(sizeof(*all_u)*6); - quac** var_u = u3m_mark(); + u3m_quac** var_u = u3m_mark(); all_u[0] = var_u[0]; all_u[1] = var_u[1]; all_u[2] = var_u[2]; @@ -1707,7 +1707,7 @@ u3_king_grab(void* vod_p) all_u[5] = c3_calloc(sizeof(*all_u[5])); all_u[5]->nam_c = "sweep"; all_u[5]->siz_w = u3a_sweep(); - + for ( c3_w i_w = 0; i_w < 6; i_w++ ) { u3a_print_quac(fil_u, 0, all_u[i_w]); u3a_quac_free(all_u[i_w]); diff --git a/pkg/vere/serf.c b/pkg/vere/serf.c index 658a0b9ff3..bf0d4ae949 100644 --- a/pkg/vere/serf.c +++ b/pkg/vere/serf.c @@ -75,7 +75,7 @@ enum { /* _serf_quac: convert a quac to a noun. */ u3_noun -_serf_quac(quac* mas_u) +_serf_quac(u3m_quac* mas_u) { u3_noun list = u3_nul; for ( c3_w i_w = 0; i_w < mas_u->len_w; i_w++ ) { @@ -84,7 +84,7 @@ _serf_quac(quac* mas_u) list = u3kb_flop(list); u3_noun mas = u3nt(u3i_string(mas_u->nam_c), u3i_word(mas_u->siz_w), list); - + c3_free(mas_u->nam_c); c3_free(mas_u->qua_u); c3_free(mas_u); @@ -95,7 +95,7 @@ _serf_quac(quac* mas_u) /* _serf_quacs: convert an array of quacs to a noun list. */ u3_noun -_serf_quacs(c3_w len_w, quac** all_u) +_serf_quacs(c3_w len_w, u3m_quac** all_u) { u3_noun list = u3_nul; for ( c3_w i_w = 0; i_w < len_w; i_w++ ) { @@ -108,7 +108,7 @@ _serf_quacs(c3_w len_w, quac** all_u) /* _serf_print_quacs: print an array of quacs. 
*/ void -_serf_print_quacs(FILE* fil_u, c3_w len_w, quac** all_u) +_serf_print_quacs(FILE* fil_u, c3_w len_w, u3m_quac** all_u) { for ( c3_w i_w = 0; i_w < len_w; i_w++) { u3a_print_quac(fil_u, 0, all_u[i_w]); @@ -159,25 +159,25 @@ _serf_grab(u3_noun sac, c3_o pri_o) u3_assert( u3R == &(u3H->rod_u) ); - quac* pro_u = u3a_prof(fil_u, sac); + u3m_quac* pro_u = u3a_prof(fil_u, sac); if ( NULL == pro_u ) { fflush(fil_u); u3z(sac); return u3_nul; } else { - quac** all_u = c3_malloc(sizeof(*all_u)*9); + u3m_quac** all_u = c3_malloc(sizeof(*all_u)*9); all_u[0] = pro_u; - quac** var_u = u3m_mark(); + u3m_quac** var_u = u3m_mark(); all_u[1] = var_u[0]; all_u[2] = var_u[1]; all_u[3] = var_u[2]; all_u[4] = var_u[3]; c3_free(var_u); - + c3_w tot_w = all_u[0]->siz_w + all_u[1]->siz_w + all_u[2]->siz_w + all_u[3]->siz_w + all_u[4]->siz_w; - + all_u[5] = c3_calloc(sizeof(*all_u[5])); all_u[5]->nam_c = strdup("space profile"); all_u[5]->siz_w = u3a_mark_noun(sac)*4; @@ -195,7 +195,7 @@ _serf_grab(u3_noun sac, c3_o pri_o) all_u[8] = c3_calloc(sizeof(*all_u[8])); all_u[8]->nam_c = strdup("sweep"); all_u[8]->siz_w = u3a_sweep()*4; - + if ( c3y == pri_o ) { _serf_print_quacs(fil_u, 9, all_u); } @@ -251,14 +251,14 @@ u3_serf_grab(c3_o pri_o) u3z(gon); } - + fprintf(stderr, "serf: measuring memory:\r\n"); if ( u3_nul != sac ) { res = _serf_grab(sac, pri_o); } else { fprintf(stderr, "sac is empty\r\n"); - quac** var_u = u3m_mark(); + u3m_quac** var_u = u3m_mark(); c3_w tot_w; tot_w = var_u[0]->siz_w + var_u[1]->siz_w @@ -274,9 +274,9 @@ u3_serf_grab(c3_o pri_o) u3a_print_memory(stderr, "sweep", u3a_sweep()); fprintf(stderr, "\r\n"); } - + fflush(stderr); - + return res; } @@ -1120,7 +1120,7 @@ u3_serf_writ(u3_serf* sef_u, u3_noun wit, u3_noun* pel) u3z(wit); u3_noun res = u3_serf_grab(c3n); if ( u3_none == res ) { - ret_o = c3n; + ret_o = c3n; } else { *pel = u3nt(c3__quiz, c3__quac, res); ret_o = c3y; diff --git a/pkg/vere/serf.h b/pkg/vere/serf.h index 973538f4f1..0645434015 100644 --- 
a/pkg/vere/serf.h +++ b/pkg/vere/serf.h @@ -59,10 +59,5 @@ u3_noun u3_serf_grab(c3_o pri_o); - /* u3_quac_free(): free quac memory. - */ - void - u3a_quac_free(quac* qua_u); - #endif /* ifndef U3_VERE_SERF_H */ From 89b7f30aa306fc049465e190412ce2be58f80102 Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Fri, 12 Jul 2024 14:08:48 -0400 Subject: [PATCH 50/97] http: cleanup wip --- pkg/vere/io/http.c | 218 ++++++++++++++++++++------------------------- 1 file changed, 99 insertions(+), 119 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index c0e7db1ae8..4ca232bd2e 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -645,26 +645,28 @@ _http_cache_respond(u3_hreq* req_u, u3_noun nun); static void _http_scry_respond(u3_hreq* req_u, u3_noun nun); +/* _content_headers: create content headers for response +*/ static u3_noun -_content_headers(c3_z start_z, c3_z end_z, c3_w total_w) +_content_headers(c3_z beg_z, c3_z end_z, c3_w tot_w) { u3_noun out; - u3_noun con_rng; - u3_noun con_len; + u3_noun rng; + u3_noun len; u3_noun lin = u3i_list(u3i_string("bytes "), - u3do("crip", u3do("a-co:co", start_z)), + u3do("crip", u3do("a-co:co", beg_z)), c3_s1('-'), u3do("crip", u3do("a-co:co", end_z)), c3_s1('/'), - u3do("crip", u3do("a-co:co", total_w)), + u3do("crip", u3do("a-co:co", tot_w)), u3_none); u3_atom dat = u3qc_rap(3, lin); - con_rng = u3nc(u3i_string("Content-Range"), dat); - con_len = u3nc(u3i_string("Content-Length"), - u3do("crip", u3do("a-co:co", (end_z - start_z) + 1))); + rng = u3nc(u3i_string("Content-Range"), dat); + len = u3nc(u3i_string("Content-Length"), + u3do("crip", u3do("a-co:co", (end_z - beg_z) + 1))); - out = u3i_list(con_rng, con_len, u3_none); + out = u3i_list(rng, len, u3_none); return out; } @@ -681,9 +683,9 @@ _http_foo_cb(void* vod_p, u3_noun nun) u3_assert(u3_rsat_peek == req_u->sat_e); req_u->peq_u = 0; if ( u3_nul != nun ) { - u3_atom lent = u3r_at(254, nun); - u3_noun cont_heds = _content_headers(0, (lent - 1), lent); 
- u3_noun mun = u3i_edit(u3k(nun), 125, u3qb_weld(cont_heds, u3r_at(125, nun))); + u3_atom len = u3r_at(254, nun); + u3_noun hez = _content_headers(0, (len - 1), len); + u3_noun mun = u3i_edit(u3k(nun), 125, u3qb_weld(hez, u3r_at(125, nun))); _http_scry_respond(req_u, u3k(mun)); } else { @@ -698,52 +700,13 @@ _http_foo_cb(void* vod_p, u3_noun nun) c3_free(peq_u); } -static c3_c* -_find_tis_fas(void* txt, c3_w len) -{ - c3_c* tis = memchr(txt, '=', len); - c3_c* fas = memchr(txt, '/', len); - - if ( tis && fas ) { - return c3_min(tis, fas); - } - else if ( tis ) { - return tis; - } - else { - return fas; - } -} - -// TODO -// [x] don't blow up on bad paths -// [x] authentication -// [x] caching -// [x] insert mime in path -// [x] range header -// [x] u3qc_cut -// [x] better range header parsing -// [x] better slicing -// [x] 206 -// [x] 200 -// [x] open range vs error -// [x] test stream -// [x] content range function -// [x] video controls -// [x] fix repeated 1 byte requests -// [x] disappearing headers -// [x] fix 200 -// [x] content-length -// [ ] mite -// [ ] don't crash, check multipart ranges -// typedef struct _range_header { - c3_z start_z; + c3_z beg_z; c3_z end_z; } range_header; typedef struct _content { - c3_z start_z; + c3_z beg_z; c3_z end_z; u3_noun dat; } content; @@ -751,54 +714,55 @@ typedef struct _content { static content _slice_mime(range_header rng, u3_noun octs) { - c3_w lent_w = u3h(octs); + c3_w len_w = u3h(octs); c3_w oct_w = u3t(octs); content out; - out.start_z = SIZE_MAX; + out.beg_z = SIZE_MAX; out.end_z = SIZE_MAX; out.dat = u3_nul; - if ( rng.start_z == SIZE_MAX ) { + if ( rng.beg_z == SIZE_MAX ) { if ( rng.end_z == SIZE_MAX ) { // [~ ~] return out; } else { // [~ @] - if ( rng.end_z > lent_w ) { + if ( rng.end_z > len_w ) { return out; } else { // slice last bytes - out.start_z = lent_w - rng.end_z; - out.end_z = lent_w - 1; + out.beg_z = len_w - rng.end_z; + out.end_z = len_w - 1; } } } else if ( rng.end_z == SIZE_MAX ) { // [@ ~] 
- if ( rng.start_z > lent_w ) { + if ( rng.beg_z > len_w ) { return out; } else { - out.start_z = rng.start_z; - out.end_z = lent_w - 1; + out.beg_z = rng.beg_z; + out.end_z = len_w - 1; } } - else if (rng.end_z > lent_w) { + else if (rng.end_z > len_w) { return out; } else { // [@ @] - out.start_z = rng.start_z; + out.beg_z = rng.beg_z; out.end_z = rng.end_z; } - if ( (out.start_z < lent_w) && - (out.end_z < lent_w) && - (out.start_z <= out.end_z) ) { - out.dat = u3nc((out.end_z - out.start_z) + 1, - u3qc_cut(3, out.start_z, (out.end_z + 1) - out.start_z, oct_w)); + if ( (out.beg_z < len_w) + && (out.end_z < len_w) + && (out.beg_z <= out.end_z) ) + { + out.dat = u3nc((out.end_z - out.beg_z) + 1, + u3qc_cut(3, out.beg_z, (out.end_z + 1) - out.beg_z, oct_w)); } return out; } @@ -808,16 +772,16 @@ _get_range(c3_c* txt_c, c3_w len_w) { c3_c* hep_c = memchr(txt_c, '-', len_w); range_header slice; - slice.start_z = SIZE_MAX; + slice.beg_z = SIZE_MAX; slice.end_z = SIZE_MAX; if ( hep_c ) { - slice.start_z = h2o_strtosize(txt_c, hep_c - txt_c); + slice.beg_z = h2o_strtosize(txt_c, hep_c - txt_c); slice.end_z = h2o_strtosize(hep_c + 1, len_w - ((hep_c + 1) - txt_c)); // strange -> [SIZE_MAX SIZE_MAX] so we return u3_nul in _slice_mime - if ( ((hep_c != txt_c) && (slice.start_z == SIZE_MAX)) || + if ( ((hep_c != txt_c) && (slice.beg_z == SIZE_MAX)) || ((len_w - ((hep_c + 1) - txt_c) > 0) && (slice.end_z == SIZE_MAX)) ) { - slice.start_z = SIZE_MAX; + slice.beg_z = SIZE_MAX; slice.end_z = SIZE_MAX; } } @@ -838,8 +802,8 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) u3_noun wir = _http_req_to_duct(req_u); u3_noun cad; - c3_c* base = req_u->rec_u->input.path.base; - c3_w len = req_u->rec_u->input.path.len; + c3_c* bas_c = req_u->rec_u->input.path.base; + c3_w len_w = req_u->rec_u->input.path.len; { u3_noun adr = u3nc(c3__ipv4, u3i_words(1, &req_u->hon_u->ipf_w)); @@ -852,9 +816,14 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) : u3nc(u3i_string("request"), dat); } - if 
( (len >= 5) && (base[1] == '_') && (base[2] == '~') && (base[3] == '_') && (base[4] == '/')) { - base = base + 4; // retain '/' after /_~_ - len = len - 4; + if ( (len_w >= 5) + && (bas_c[1] == '_') + && (bas_c[2] == '~') + && (bas_c[3] == '_') + && (bas_c[4] == '/') ) + { + bas_c = bas_c + 4; // retain '/' after /_~_ + len_w = len_w - 4; req_u->peq_u = c3_malloc(sizeof(*req_u->peq_u)); req_u->peq_u->req_u = req_u; @@ -881,65 +850,78 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) u3_noun cas; c3_o last = c3n; - c3_w i; - // get beak from path // - for (i = 0; i < 3; ++i) { - u3_noun* where; - if ( i == 0 ) { - where = &who; + for ( c3_w i_w = 0; i_w < 3; ++i_w ) { + u3_noun* wer; + if ( 0 == i_w ) { + wer = &who; } - else if ( i == 1 ) { - where = &des; + else if ( 1 == i_w ) { + wer = &des; } else { - where = &cas; + wer = &cas; } // find '//' - if ( len >= 2 && base[0] == '/' && base[1] == '/' ) { - *where = u3_nul; - base++; - len--; + if ( (len_w >= 2) + && ('/' == bas_c[0]) + && ('/' == bas_c[1]) ) + { + *wer = u3_nul; + bas_c++; + len_w--; } // skip '/' - else if ( len > 0 && base[0] == '/' ) { - base++; - len--; + else if ( (len_w > 0) && ('/' == bas_c[0]) ) { + bas_c++; + len_w--; } // '=' - if ( len > 0 && base[0] == '=' ) { - if ( i == 0 ) { - *where = our; + if ( (len_w > 0) && ('=' == bas_c[0]) ) { + if ( 0 == i_w ) { + *wer = our; } - else if ( i == 1 ) { - *where = u3i_string("base"); + else if ( 1 == i_w ) { + *wer = u3i_string("base"); } else { last = c3y; } - base++; - len--; + bas_c++; + len_w--; } // slice cord else { - c3_c* nex = _find_tis_fas(base, len); - if ( !nex ) { + c3_c* nex_c; + c3_c* tis_c = memchr(bas_c, '=', len_w); + c3_c* fas_c = memchr(bas_c, '/', len_w); + if ( tis_c && fas_c ) { + nex_c = c3_min(tis_c, fas_c); + } + else if ( tis_c ) { + nex_c = tis_c; + } + else { + nex_c = fas_c; + } + + if ( !nex_c ) { c3_c* msg_c = "bad beam"; h2o_send_error_generic(req_u->rec_u, 400, msg_c, msg_c, 0); return; } else { - c3_d mylen = 
nex - base; - *where = u3i_bytes(mylen, base); - base = nex; - len = len - mylen; + c3_d len_d = nex_c - bas_c; + *wer = u3i_bytes(len_d, (const c3_y*)bas_c); + bas_c = nex_c; + len_w = len_w - len_d; } } } - u3_noun spur = u3dc("rush", u3i_bytes(len, (const c3_y*)base), u3v_wish("stap")); + u3_noun spur = u3dc("rush", u3i_bytes(len_w, (const c3_y*)bas_c), u3v_wish("stap")); if ( (who != our) || (spur == u3_nul) ) { c3_c* msg_c = "bad scry path"; @@ -972,23 +954,23 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) if (idx != UINT32_MAX) { if ( (req_headers.entries[idx].value.len >= 6) && - (memcmp("bytes=", req_headers.entries[idx].value.base, 6) == 0 )) { + (0 == memcmp("bytes=", req_headers.entries[idx].value.base, 6)) ) { c3_w rest_len = req_headers.entries[idx].value.len - 6; - if ( rest_len == 0) { + if ( 0 == rest_len ) { c3_c* msg_c = "Requested Range Not Satisfiable"; h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); return; } range_header rng_hed = _get_range(req_headers.entries[idx].value.base + 6, rest_len); u3_noun octs = u3r_at(127, nac); - if ( octs == u3_none) { + if ( u3_none == octs ) { c3_c* msg_c = "Requested Range Not Satisfiable"; h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); return; } content result = _slice_mime(rng_hed, octs); - if ( result.dat == u3_nul) { + if ( u3_nul == result.dat ) { c3_c* msg_c = "Requested Range Not Satisfiable"; h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); return; @@ -996,17 +978,17 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) if ( u3r_sing(result.dat, octs) == c3y) { // 200 - u3_atom lent = u3r_at(254, nac); - u3_noun cont_heds = _content_headers(0, (lent - 1), lent); - u3_noun mac = u3i_edit(nac, 125, u3qb_weld(cont_heds, u3r_at(125, nac))); + u3_atom len = u3r_at(254, nac); + u3_noun hez = _content_headers(0, (len - 1), len); + u3_noun mac = u3i_edit(nac, 125, u3qb_weld(hez, u3r_at(125, nac))); _http_cache_respond(req_u, mac); } else { // 206 - u3_noun cont_heds = 
_content_headers(result.start_z, result.end_z, u3h(octs)); + u3_noun hez = _content_headers(result.beg_z, result.end_z, u3h(octs)); u3_noun res = u3i_edit(nac, 127, result.dat); res = u3i_edit(res, 124, 206); - res = u3i_edit(res, 125, u3qb_weld(cont_heds, u3r_at(125, res))); + res = u3i_edit(res, 125, u3qb_weld(hez, u3r_at(125, res))); _http_cache_respond(req_u, res); } } @@ -1050,9 +1032,7 @@ _http_scry_respond(u3_hreq* req_u, u3_noun nun) { } else { u3_noun auth, response_header, data; - // XX check: looks good u3x_qual(u3k(u3t(u3t(nun))), &auth, 0, &response_header, &data); - // u3m_p("res-header", response_header); u3_noun status, headers; u3x_cell(response_header, &status, &headers); From e0a961a26b4de0458d052fa970af8087109c224b Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Mon, 15 Jul 2024 23:33:28 -0400 Subject: [PATCH 51/97] http: cleanup, headers --- pkg/vere/io/http.c | 75 +++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index 4ca232bd2e..c8496e90d0 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -651,6 +651,7 @@ static u3_noun _content_headers(c3_z beg_z, c3_z end_z, c3_w tot_w) { u3_noun out; + u3_noun byz; u3_noun rng; u3_noun len; @@ -662,8 +663,8 @@ _content_headers(c3_z beg_z, c3_z end_z, c3_w tot_w) u3do("crip", u3do("a-co:co", tot_w)), u3_none); u3_atom dat = u3qc_rap(3, lin); - rng = u3nc(u3i_string("Content-Range"), dat); - len = u3nc(u3i_string("Content-Length"), + rng = u3nc(u3i_string("content-range"), dat); + len = u3nc(u3i_string("content-length"), u3do("crip", u3do("a-co:co", (end_z - beg_z) + 1))); out = u3i_list(rng, len, u3_none); @@ -685,25 +686,26 @@ _http_foo_cb(void* vod_p, u3_noun nun) if ( u3_nul != nun ) { u3_atom len = u3r_at(254, nun); u3_noun hez = _content_headers(0, (len - 1), len); - u3_noun mun = u3i_edit(u3k(nun), 125, u3qb_weld(hez, u3r_at(125, nun))); - _http_scry_respond(req_u, u3k(mun)); + u3_noun res 
= u3i_edit(u3k(nun), 125, u3qb_weld(hez, u3r_at(125, nun))); + _http_scry_respond(req_u, u3k(res)); } else { _http_scry_respond(req_u, u3k(nun)); } } - if ( peq_u->las_o == c3n ) { + // cache only if peek was not at now + if ( c3n == peq_u->las_o ) { u3h_put(htd_u->nax_p, peq_u->pax, nun); } u3z(peq_u->pax); c3_free(peq_u); } -typedef struct _range_header { +typedef struct _range_request { c3_z beg_z; c3_z end_z; -} range_header; +} range_request; typedef struct _content { c3_z beg_z; @@ -712,7 +714,7 @@ typedef struct _content { } content; static content -_slice_mime(range_header rng, u3_noun octs) +_slice_mime(range_request rng, u3_noun octs) { c3_w len_w = u3h(octs); c3_w oct_w = u3t(octs); @@ -722,8 +724,8 @@ _slice_mime(range_header rng, u3_noun octs) out.end_z = SIZE_MAX; out.dat = u3_nul; - if ( rng.beg_z == SIZE_MAX ) { - if ( rng.end_z == SIZE_MAX ) { + if ( SIZE_MAX == rng.beg_z ) { + if ( SIZE_MAX == rng.end_z ) { // [~ ~] return out; } @@ -739,7 +741,7 @@ _slice_mime(range_header rng, u3_noun octs) } } } - else if ( rng.end_z == SIZE_MAX ) { + else if ( SIZE_MAX == rng.end_z ) { // [@ ~] if ( rng.beg_z > len_w ) { return out; @@ -767,25 +769,26 @@ _slice_mime(range_header rng, u3_noun octs) return out; } -static range_header -_get_range(c3_c* txt_c, c3_w len_w) +static range_request +_parse_range(c3_c* txt_c, c3_w len_w) { c3_c* hep_c = memchr(txt_c, '-', len_w); - range_header slice; - slice.beg_z = SIZE_MAX; - slice.end_z = SIZE_MAX; + range_request cut; + cut.beg_z = SIZE_MAX; + cut.end_z = SIZE_MAX; if ( hep_c ) { - slice.beg_z = h2o_strtosize(txt_c, hep_c - txt_c); - slice.end_z = h2o_strtosize(hep_c + 1, len_w - ((hep_c + 1) - txt_c)); + cut.beg_z = h2o_strtosize(txt_c, hep_c - txt_c); + cut.end_z = h2o_strtosize(hep_c + 1, len_w - ((hep_c + 1) - txt_c)); // strange -> [SIZE_MAX SIZE_MAX] so we return u3_nul in _slice_mime - if ( ((hep_c != txt_c) && (slice.beg_z == SIZE_MAX)) || - ((len_w - ((hep_c + 1) - txt_c) > 0) && (slice.end_z == SIZE_MAX)) 
) { - slice.beg_z = SIZE_MAX; - slice.end_z = SIZE_MAX; + if ( ((SIZE_MAX == cut.beg_z) && (hep_c != txt_c)) + || ((SIZE_MAX == cut.end_z) && (len_w - ((hep_c + 1) - txt_c) > 0)) ) + { + cut.beg_z = SIZE_MAX; + cut.end_z = SIZE_MAX; } } - return slice; + return cut; } /* _http_req_dispatch(): dispatch http request to %eyre @@ -817,10 +820,10 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) } if ( (len_w >= 5) - && (bas_c[1] == '_') - && (bas_c[2] == '~') - && (bas_c[3] == '_') - && (bas_c[4] == '/') ) + && ('_' == bas_c[1]) + && ('~' == bas_c[2]) + && ('_' == bas_c[3]) + && ('/' == bas_c[4]) ) { bas_c = bas_c + 4; // retain '/' after /_~_ len_w = len_w - 4; @@ -828,6 +831,7 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) req_u->peq_u = c3_malloc(sizeof(*req_u->peq_u)); req_u->peq_u->req_u = req_u; req_u->peq_u->htd_u = htd_u; + req_u->peq_u->las_o = c3n; req_u->sat_e = u3_rsat_peek; u3_hfig* fig_u = &req_u->hon_u->htp_u->htd_u->fig_u; @@ -848,7 +852,6 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) u3_noun who; u3_noun des; u3_noun cas; - c3_o last = c3n; // get beak from path // @@ -887,7 +890,7 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) *wer = u3i_string("base"); } else { - last = c3y; + req_u->peq_u->las_o = c3y; } bas_c++; len_w--; @@ -923,7 +926,7 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) u3_noun spur = u3dc("rush", u3i_bytes(len_w, (const c3_y*)bas_c), u3v_wish("stap")); - if ( (who != our) || (spur == u3_nul) ) { + if ( (who != our) || (u3_nul == spur) ) { c3_c* msg_c = "bad scry path"; h2o_send_error_generic(req_u->rec_u, 400, msg_c, msg_c, 0); return; @@ -931,9 +934,7 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) else { spur = u3nc(u3i_string("mime"), u3t(spur)); - if ( c3y == last ) { - // DON'T CACHE - req_u->peq_u->las_o = c3y; + if ( c3y == req_u->peq_u->las_o ) { u3_pier_peek_last(htd_u->car_u.pir_u, gang, c3__ex, des, spur, req_u->peq_u, _http_foo_cb); } @@ -961,14 +962,14 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun 
req) h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); return; } - range_header rng_hed = _get_range(req_headers.entries[idx].value.base + 6, rest_len); + range_request rng_req = _parse_range(req_headers.entries[idx].value.base + 6, rest_len); u3_noun octs = u3r_at(127, nac); if ( u3_none == octs ) { c3_c* msg_c = "Requested Range Not Satisfiable"; h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); return; } - content result = _slice_mime(rng_hed, octs); + content result = _slice_mime(rng_req, octs); if ( u3_nul == result.dat ) { c3_c* msg_c = "Requested Range Not Satisfiable"; @@ -980,8 +981,8 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) // 200 u3_atom len = u3r_at(254, nac); u3_noun hez = _content_headers(0, (len - 1), len); - u3_noun mac = u3i_edit(nac, 125, u3qb_weld(hez, u3r_at(125, nac))); - _http_cache_respond(req_u, mac); + u3_noun res = u3i_edit(nac, 125, u3qb_weld(hez, u3r_at(125, nac))); + _http_cache_respond(req_u, res); } else { // 206 From 10687827f81310dfd7039d924acbfa2d1785e8c8 Mon Sep 17 00:00:00 2001 From: Tinnus Napbus Date: Tue, 16 Jul 2024 18:24:57 +1200 Subject: [PATCH 52/97] mass: cleanup per matthew-levan feedback - init len_w in quac, use null term'd list instead - add spaces between arithmetic operators - publicise _lord_writ_new() as u3_lord_writ_new() - rename lord_writ_plan as u3_lord_writ_plan - remove superfluous printfs - misc tidying --- pkg/noun/allocate.c | 51 +++++++++++++++++++++++---------------------- pkg/noun/jets.c | 20 ++++++++++-------- pkg/noun/manage.c | 3 ++- pkg/noun/manage.h | 1 - pkg/noun/nock.c | 9 ++++---- pkg/noun/vortex.c | 11 +++++----- pkg/vere/io/term.c | 6 ++---- pkg/vere/lord.c | 42 ++++++++++++++++++------------------- pkg/vere/serf.c | 49 ++++++++++++++++++++++++++----------------- pkg/vere/vere.h | 9 ++++++-- 10 files changed, 110 insertions(+), 91 deletions(-) diff --git a/pkg/noun/allocate.c b/pkg/noun/allocate.c index f17392f4f0..60332b5b52 100644 --- a/pkg/noun/allocate.c +++ 
b/pkg/noun/allocate.c @@ -2032,8 +2032,10 @@ _ca_print_memory(FILE* fil_u, c3_w byt_w) void u3a_quac_free(u3m_quac* qua_u) { - for ( c3_w i_w = 0; i_w < qua_u->len_w; i_w++ ) { + c3_w i_w = 0; + while ( qua_u->qua_u[i_w] != NULL ) { u3a_quac_free(qua_u->qua_u[i_w]); + i_w++; } c3_free(qua_u->nam_c); c3_free(qua_u->qua_u); @@ -2045,7 +2047,6 @@ u3a_quac_free(u3m_quac* qua_u) u3m_quac* u3a_prof(FILE* fil_u, u3_noun mas) { -// c3_w tot_w = 0; u3m_quac* pro_u = c3_calloc(sizeof(*pro_u)); u3_noun h_mas, t_mas; @@ -2106,35 +2107,33 @@ u3a_prof(FILE* fil_u, u3_noun mas) #endif pro_u->nam_c = u3r_string(h_mas); pro_u->siz_w = siz_w*4; - pro_u->len_w = 0; - pro_u->qua_u = 0; + pro_u->qua_u = NULL; return pro_u; } else if ( c3n == it_mas ) { - fprintf(fil_u, "\r\n"); pro_u->qua_u = c3_malloc(sizeof(pro_u->qua_u)); - pro_u->len_w = 0; - c3_w i_w = 2; + c3_w i_w = 0; c3_t bad_t = 0; while ( c3y == u3du(tt_mas) ) { u3m_quac* new_u = u3a_prof(fil_u, u3h(tt_mas)); if ( NULL == new_u ) { - c3_free(new_u); bad_t = 1; } else { - pro_u->qua_u = c3_realloc(pro_u->qua_u, i_w*sizeof(pro_u->qua_u)); + pro_u->qua_u = c3_realloc(pro_u->qua_u, (i_w + 2) * sizeof(pro_u->qua_u)); pro_u->siz_w += new_u->siz_w; - pro_u->qua_u[pro_u->len_w] = new_u; - pro_u->len_w++; + pro_u->qua_u[i_w] = new_u; } tt_mas = u3t(tt_mas); i_w++; } + pro_u->qua_u[i_w] = NULL; if ( bad_t ) { - for ( i_w = 0; i_w < pro_u->len_w ; i_w++ ) { + i_w = 0; + while ( pro_u->qua_u[i_w] != NULL ) { u3a_quac_free(pro_u->qua_u[i_w]); + i_w++; } c3_free(pro_u->qua_u); c3_free(pro_u); @@ -2164,13 +2163,14 @@ u3a_print_quac(FILE* fil_u, c3_w den_w, u3m_quac* mas_u) if ( mas_u->siz_w ) { fprintf(fil_u, "%*s%s: ", den_w, "", mas_u->nam_c); - if ( mas_u->len_w == 0) { + if ( mas_u->qua_u == NULL ) { _ca_print_memory(fil_u, mas_u->siz_w); } else { fprintf(fil_u, "\r\n"); - c3_w i_w; - for ( i_w = 0; i_w < mas_u->len_w; i_w++ ) { + c3_w i_w = 0; + while ( mas_u->qua_u[i_w] != NULL ) { u3a_print_quac(fil_u, den_w+2, mas_u->qua_u[i_w]); + 
i_w++; } fprintf(fil_u, "%*s--", den_w, ""); _ca_print_memory(fil_u, mas_u->siz_w); @@ -2183,39 +2183,41 @@ u3a_print_quac(FILE* fil_u, c3_w den_w, u3m_quac* mas_u) u3m_quac* u3a_mark_road() { - u3m_quac** qua_u = c3_malloc(sizeof(*qua_u)*8); + u3m_quac** qua_u = c3_malloc(sizeof(*qua_u) * 9); qua_u[0] = c3_calloc(sizeof(*qua_u[0])); qua_u[0]->nam_c = strdup("namespace"); - qua_u[0]->siz_w = u3a_mark_noun(u3R->ski.gul)*4; + qua_u[0]->siz_w = u3a_mark_noun(u3R->ski.gul) * 4; qua_u[1] = c3_calloc(sizeof(*qua_u[1])); qua_u[1]->nam_c = strdup("trace stack"); - qua_u[1]->siz_w = u3a_mark_noun(u3R->ski.gul)*4; + qua_u[1]->siz_w = u3a_mark_noun(u3R->ski.gul) * 4; qua_u[2] = c3_calloc(sizeof(*qua_u[2])); qua_u[2]->nam_c = strdup("trace buffer"); - qua_u[2]->siz_w = u3a_mark_noun(u3R->bug.mer)*4; + qua_u[2]->siz_w = u3a_mark_noun(u3R->bug.mer) * 4; qua_u[3] = c3_calloc(sizeof(*qua_u[3])); qua_u[3]->nam_c = strdup("profile batteries"); - qua_u[3]->siz_w = u3a_mark_noun(u3R->pro.don)*4; + qua_u[3]->siz_w = u3a_mark_noun(u3R->pro.don) * 4; qua_u[4] = c3_calloc(sizeof(*qua_u[4])); qua_u[4]->nam_c = strdup("profile doss"); - qua_u[4]->siz_w = u3a_mark_noun(u3R->pro.day)*4; + qua_u[4]->siz_w = u3a_mark_noun(u3R->pro.day) * 4; qua_u[5] = c3_calloc(sizeof(*qua_u[5])); qua_u[5]->nam_c = strdup("new profile trace"); - qua_u[5]->siz_w = u3a_mark_noun(u3R->pro.trace)*4; + qua_u[5]->siz_w = u3a_mark_noun(u3R->pro.trace) * 4; qua_u[6] = c3_calloc(sizeof(*qua_u[6])); qua_u[6]->nam_c = strdup("transient memoization cache"); - qua_u[6]->siz_w = u3h_mark(u3R->cax.har_p)*4; + qua_u[6]->siz_w = u3h_mark(u3R->cax.har_p) * 4; qua_u[7] = c3_calloc(sizeof(*qua_u[7])); qua_u[7]->nam_c = strdup("persistent memoization cache"); - qua_u[7]->siz_w = u3h_mark(u3R->cax.per_p)*4; + qua_u[7]->siz_w = u3h_mark(u3R->cax.per_p) * 4; + + qua_u[8] = NULL; c3_w sum_w = 0; for (c3_w i_w = 0; i_w < 8; i_w++) { @@ -2225,7 +2227,6 @@ u3a_mark_road() u3m_quac* tot_u = c3_malloc(sizeof(*tot_u)); tot_u->nam_c = 
strdup("total road stuff"); tot_u->siz_w = sum_w; - tot_u->len_w = 8; tot_u->qua_u = qua_u; return tot_u; diff --git a/pkg/noun/jets.c b/pkg/noun/jets.c index 2f1dc3a0df..c0b17142ef 100644 --- a/pkg/noun/jets.c +++ b/pkg/noun/jets.c @@ -2308,28 +2308,28 @@ _cj_mark_hank(u3_noun kev, void* dat) u3m_quac* u3j_mark() { - u3m_quac** qua_u = c3_malloc(sizeof(*qua_u)*6); + u3m_quac** qua_u = c3_malloc(sizeof(*qua_u) * 7); qua_u[0] = c3_calloc(sizeof(*qua_u[0])); qua_u[0]->nam_c = strdup("warm jet state"); - qua_u[0]->siz_w = u3h_mark(u3R->jed.war_p)*4; + qua_u[0]->siz_w = u3h_mark(u3R->jed.war_p) * 4; qua_u[1] = c3_calloc(sizeof(*qua_u[1])); qua_u[1]->nam_c = strdup("cold jet state"); - qua_u[1]->siz_w = u3h_mark(u3R->jed.cod_p)*4; + qua_u[1]->siz_w = u3h_mark(u3R->jed.cod_p) * 4; qua_u[2] = c3_calloc(sizeof(*qua_u[2])); qua_u[2]->nam_c = strdup("hank cache"); - qua_u[2]->siz_w = u3h_mark(u3R->jed.han_p)*4; + qua_u[2]->siz_w = u3h_mark(u3R->jed.han_p) * 4; qua_u[3] = c3_calloc(sizeof(*qua_u[3])); qua_u[3]->nam_c = strdup("battery hash cache"); - qua_u[3]->siz_w = u3h_mark(u3R->jed.bas_p)*4; + qua_u[3]->siz_w = u3h_mark(u3R->jed.bas_p) * 4; qua_u[4] = c3_calloc(sizeof(*qua_u[4])); qua_u[4]->nam_c = strdup("call site cache"); u3h_walk_with(u3R->jed.han_p, _cj_mark_hank, &qua_u[4]->siz_w); - qua_u[4]->siz_w = qua_u[4]->siz_w*4; + qua_u[4]->siz_w *= 4; c3_w sum_w = 0; for ( c3_w i_w = 0; i_w < 5; i_w++ ) { @@ -2342,18 +2342,20 @@ u3j_mark() if ( u3R == &(u3H->rod_u) ) { qua_u[5] = c3_calloc(sizeof(*qua_u[5])); qua_u[5]->nam_c = strdup("hot jet state"); - qua_u[5]->siz_w = u3h_mark(u3R->jed.hot_p)*4; + qua_u[5]->siz_w = u3h_mark(u3R->jed.hot_p) * 4; sum_w += qua_u[5]->siz_w; + qua_u[6] = NULL; + tot_u->siz_w = sum_w; - tot_u->len_w = 6; tot_u->qua_u = qua_u; return tot_u; } else { + qua_u[5] = NULL; + tot_u->siz_w = sum_w; - tot_u->len_w = 5; tot_u->qua_u = qua_u; return tot_u; diff --git a/pkg/noun/manage.c b/pkg/noun/manage.c index 3a462ef9ea..3f94f34055 100644 --- 
a/pkg/noun/manage.c +++ b/pkg/noun/manage.c @@ -462,11 +462,12 @@ u3m_file(c3_c* pas_c) u3m_quac** u3m_mark() { - u3m_quac** qua_u = c3_malloc(sizeof(*qua_u)*4); + u3m_quac** qua_u = c3_malloc(sizeof(*qua_u) * 5); qua_u[0] = u3v_mark(); qua_u[1] = u3j_mark(); qua_u[2] = u3n_mark(); qua_u[3] = u3a_mark_road(); + qua_u[4] = NULL; return qua_u; } diff --git a/pkg/noun/manage.h b/pkg/noun/manage.h index 33c75b4ee4..974c5948f6 100644 --- a/pkg/noun/manage.h +++ b/pkg/noun/manage.h @@ -154,7 +154,6 @@ typedef struct _u3m_quac { c3_c* nam_c; c3_w siz_w; - c3_w len_w; struct _u3m_quac** qua_u; } u3m_quac; diff --git a/pkg/noun/nock.c b/pkg/noun/nock.c index e7b1ab64be..7530191c01 100644 --- a/pkg/noun/nock.c +++ b/pkg/noun/nock.c @@ -3050,23 +3050,24 @@ _n_bam(u3_noun kev, void* dat) u3m_quac* u3n_mark() { - u3m_quac** qua_u = c3_malloc(sizeof(*qua_u)*2); + u3m_quac** qua_u = c3_malloc(sizeof(*qua_u) * 3); qua_u[0] = c3_calloc(sizeof(*qua_u[0])); qua_u[0]->nam_c = strdup("bytecode programs"); u3p(u3h_root) har_p = u3R->byc.har_p; u3h_walk_with(har_p, _n_bam, &qua_u[0]->siz_w); - qua_u[0]->siz_w = qua_u[0]->siz_w*4; + qua_u[0]->siz_w = qua_u[0]->siz_w * 4; qua_u[1] = c3_calloc(sizeof(*qua_u[1])); qua_u[1]->nam_c = strdup("bytecode cache"); - qua_u[1]->siz_w = u3h_mark(har_p)*4; + qua_u[1]->siz_w = u3h_mark(har_p) * 4; + + qua_u[2] = NULL; u3m_quac* tot_u = c3_malloc(sizeof(*tot_u)); tot_u->nam_c = strdup("total nock stuff"); tot_u->siz_w = qua_u[0]->siz_w + qua_u[1]->siz_w; - tot_u->len_w = 2; tot_u->qua_u = qua_u; return tot_u; diff --git a/pkg/noun/vortex.c b/pkg/noun/vortex.c index c49d73780f..287be49e8c 100644 --- a/pkg/noun/vortex.c +++ b/pkg/noun/vortex.c @@ -397,24 +397,25 @@ u3v_mark() { u3v_arvo* arv_u = &(u3H->arv_u); - u3m_quac** qua_u = c3_malloc(sizeof(*qua_u)*3); + u3m_quac** qua_u = c3_malloc(sizeof(*qua_u) * 4); qua_u[0] = c3_calloc(sizeof(*qua_u[0])); qua_u[0]->nam_c = strdup("kernel"); - qua_u[0]->siz_w = u3a_mark_noun(arv_u->roc)*4; + qua_u[0]->siz_w = 
u3a_mark_noun(arv_u->roc) * 4; qua_u[1] = c3_calloc(sizeof(*qua_u[1])); qua_u[1]->nam_c = strdup("date"); - qua_u[1]->siz_w = u3a_mark_noun(arv_u->now)*4; + qua_u[1]->siz_w = u3a_mark_noun(arv_u->now) * 4; qua_u[2] = c3_calloc(sizeof(*qua_u[2])); qua_u[2]->nam_c = strdup("wish cache"); - qua_u[2]->siz_w = u3a_mark_noun(arv_u->yot)*4; + qua_u[2]->siz_w = u3a_mark_noun(arv_u->yot) * 4; + + qua_u[3] = NULL; u3m_quac* tot_u = c3_malloc(sizeof(*tot_u)); tot_u->nam_c = strdup("total arvo stuff"); tot_u->siz_w = qua_u[0]->siz_w + qua_u[1]->siz_w + qua_u[2]->siz_w; - tot_u->len_w = 3; tot_u->qua_u = qua_u; return tot_u; diff --git a/pkg/vere/io/term.c b/pkg/vere/io/term.c index 927116bc62..bf9789aabc 100644 --- a/pkg/vere/io/term.c +++ b/pkg/vere/io/term.c @@ -1741,14 +1741,12 @@ _term_io_kick(u3_auto* car_u, u3_noun wir, u3_noun cad) case c3__quac: { ret_o = c3y; - // construct and send writ here - u3_writ* wit_u = c3_calloc(sizeof(*wit_u)); + u3_writ* wit_u = u3_lord_writ_new(u3K.pir_u->god_u); wit_u->typ_e = u3_writ_quiz; wit_u->qui_u.ptr_v = car_u; - u3l_log("car_u: %p", car_u); wit_u->qui_u.quiz_f = _term_io_quiz; - lord_writ_plan(u3K.pir_u->god_u, wit_u); + u3_lord_writ_plan(u3K.pir_u->god_u, wit_u); } break; } diff --git a/pkg/vere/lord.c b/pkg/vere/lord.c index 565dab2baf..3553aa402f 100644 --- a/pkg/vere/lord.c +++ b/pkg/vere/lord.c @@ -763,10 +763,10 @@ _lord_on_plea(void* ptr_v, c3_d len_d, c3_y* byt_y) u3z(jar); } -/* _lord_writ_new(): allocate a new writ. +/* u3_lord_writ_new(): allocate a new writ. */ -static u3_writ* -_lord_writ_new(u3_lord* god_u) +u3_writ* +u3_lord_writ_new(u3_lord* god_u) { u3_writ* wit_u = c3_calloc(sizeof(*wit_u)); return wit_u; @@ -869,10 +869,10 @@ _lord_writ_send(u3_lord* god_u, u3_writ* wit_u) } } -/* lord_writ_plan(): enqueue a writ and send. +/* u3_lord_writ_plan(): enqueue a writ and send. 
*/ void -lord_writ_plan(u3_lord* god_u, u3_writ* wit_u) +u3_lord_writ_plan(u3_lord* god_u, u3_writ* wit_u) { if ( !god_u->ent_u ) { u3_assert( !god_u->ext_u ); @@ -894,7 +894,7 @@ lord_writ_plan(u3_lord* god_u, u3_writ* wit_u) void u3_lord_peek(u3_lord* god_u, u3_pico* pic_u) { - u3_writ* wit_u = _lord_writ_new(god_u); + u3_writ* wit_u = u3_lord_writ_new(god_u); wit_u->typ_e = u3_writ_peek; wit_u->pek_u = c3_calloc(sizeof(*wit_u->pek_u)); wit_u->pek_u->ptr_v = pic_u->ptr_v; @@ -925,7 +925,7 @@ u3_lord_peek(u3_lord* god_u, u3_pico* pic_u) // XX cache check, unless last // - lord_writ_plan(god_u, wit_u); + u3_lord_writ_plan(god_u, wit_u); } /* u3_lord_play(): recompute batch. @@ -933,7 +933,7 @@ u3_lord_peek(u3_lord* god_u, u3_pico* pic_u) void u3_lord_play(u3_lord* god_u, u3_info fon_u) { - u3_writ* wit_u = _lord_writ_new(god_u); + u3_writ* wit_u = u3_lord_writ_new(god_u); wit_u->typ_e = u3_writ_play; wit_u->fon_u = fon_u; @@ -941,7 +941,7 @@ u3_lord_play(u3_lord* god_u, u3_info fon_u) // // u3_assert( !pay_u.ent_u->nex_u ); - lord_writ_plan(god_u, wit_u); + u3_lord_writ_plan(god_u, wit_u); } /* u3_lord_work(): attempt work. @@ -949,7 +949,7 @@ u3_lord_play(u3_lord* god_u, u3_info fon_u) void u3_lord_work(u3_lord* god_u, u3_ovum* egg_u, u3_noun job) { - u3_writ* wit_u = _lord_writ_new(god_u); + u3_writ* wit_u = u3_lord_writ_new(god_u); wit_u->typ_e = u3_writ_work; wit_u->wok_u.egg_u = egg_u; wit_u->wok_u.job = job; @@ -963,7 +963,7 @@ u3_lord_work(u3_lord* god_u, u3_ovum* egg_u, u3_noun job) god_u->pin_o = c3y; } - lord_writ_plan(god_u, wit_u); + u3_lord_writ_plan(god_u, wit_u); } /* u3_lord_save(): save a snapshot. 
@@ -975,9 +975,9 @@ u3_lord_save(u3_lord* god_u) return c3n; } else { - u3_writ* wit_u = _lord_writ_new(god_u); + u3_writ* wit_u = u3_lord_writ_new(god_u); wit_u->typ_e = u3_writ_save; - lord_writ_plan(god_u, wit_u); + u3_lord_writ_plan(god_u, wit_u); return c3y; } } @@ -991,9 +991,9 @@ u3_lord_cram(u3_lord* god_u) return c3n; } else { - u3_writ* wit_u = _lord_writ_new(god_u); + u3_writ* wit_u = u3_lord_writ_new(god_u); wit_u->typ_e = u3_writ_cram; - lord_writ_plan(god_u, wit_u); + u3_lord_writ_plan(god_u, wit_u); return c3y; } } @@ -1003,9 +1003,9 @@ u3_lord_cram(u3_lord* god_u) void u3_lord_meld(u3_lord* god_u) { - u3_writ* wit_u = _lord_writ_new(god_u); + u3_writ* wit_u = u3_lord_writ_new(god_u); wit_u->typ_e = u3_writ_meld; - lord_writ_plan(god_u, wit_u); + u3_lord_writ_plan(god_u, wit_u); } /* u3_lord_pack(): defragment persistent state. @@ -1013,9 +1013,9 @@ u3_lord_meld(u3_lord* god_u) void u3_lord_pack(u3_lord* god_u) { - u3_writ* wit_u = _lord_writ_new(god_u); + u3_writ* wit_u = u3_lord_writ_new(god_u); wit_u->typ_e = u3_writ_pack; - lord_writ_plan(god_u, wit_u); + u3_lord_writ_plan(god_u, wit_u); } /* u3_lord_exit(): shutdown gracefully. 
@@ -1023,9 +1023,9 @@ u3_lord_pack(u3_lord* god_u) void u3_lord_exit(u3_lord* god_u) { - u3_writ* wit_u = _lord_writ_new(god_u); + u3_writ* wit_u = u3_lord_writ_new(god_u); wit_u->typ_e = u3_writ_exit; - lord_writ_plan(god_u, wit_u); + u3_lord_writ_plan(god_u, wit_u); // XX set timer, then halt } diff --git a/pkg/vere/serf.c b/pkg/vere/serf.c index bf0d4ae949..fd28f47f08 100644 --- a/pkg/vere/serf.c +++ b/pkg/vere/serf.c @@ -78,8 +78,12 @@ u3_noun _serf_quac(u3m_quac* mas_u) { u3_noun list = u3_nul; - for ( c3_w i_w = 0; i_w < mas_u->len_w; i_w++ ) { - list = u3nc(_serf_quac(mas_u->qua_u[i_w]), list); + c3_w i_w = 0; + if ( mas_u->qua_u != NULL ) { + while ( mas_u->qua_u[i_w] != NULL ) { + list = u3nc(_serf_quac(mas_u->qua_u[i_w]), list); + i_w++; + } } list = u3kb_flop(list); @@ -95,11 +99,13 @@ _serf_quac(u3m_quac* mas_u) /* _serf_quacs: convert an array of quacs to a noun list. */ u3_noun -_serf_quacs(c3_w len_w, u3m_quac** all_u) +_serf_quacs(u3m_quac** all_u) { u3_noun list = u3_nul; - for ( c3_w i_w = 0; i_w < len_w; i_w++ ) { + c3_w i_w = 0; + while ( all_u[i_w] != NULL ) { list = u3nc(_serf_quac(all_u[i_w]), list); + i_w++; } c3_free(all_u); return u3kb_flop(list); @@ -108,10 +114,13 @@ _serf_quacs(c3_w len_w, u3m_quac** all_u) /* _serf_print_quacs: print an array of quacs. 
*/ void -_serf_print_quacs(FILE* fil_u, c3_w len_w, u3m_quac** all_u) +_serf_print_quacs(FILE* fil_u, u3m_quac** all_u) { - for ( c3_w i_w = 0; i_w < len_w; i_w++) { + fprintf(fil_u, "\r\n"); + c3_w i_w = 0; + while ( all_u[i_w] != NULL ) { u3a_print_quac(fil_u, 0, all_u[i_w]); + i_w++; } } @@ -160,12 +169,13 @@ _serf_grab(u3_noun sac, c3_o pri_o) u3_assert( u3R == &(u3H->rod_u) ); u3m_quac* pro_u = u3a_prof(fil_u, sac); + if ( NULL == pro_u ) { fflush(fil_u); u3z(sac); return u3_nul; } else { - u3m_quac** all_u = c3_malloc(sizeof(*all_u)*9); + u3m_quac** all_u = c3_malloc(sizeof(*all_u) * 10); all_u[0] = pro_u; u3m_quac** var_u = u3m_mark(); @@ -180,7 +190,7 @@ _serf_grab(u3_noun sac, c3_o pri_o) all_u[5] = c3_calloc(sizeof(*all_u[5])); all_u[5]->nam_c = strdup("space profile"); - all_u[5]->siz_w = u3a_mark_noun(sac)*4; + all_u[5]->siz_w = u3a_mark_noun(sac) * 4; tot_w += all_u[5]->siz_w; @@ -190,14 +200,16 @@ _serf_grab(u3_noun sac, c3_o pri_o) all_u[7] = c3_calloc(sizeof(*all_u[7])); all_u[7]->nam_c = strdup("free lists"); - all_u[7]->siz_w = u3a_idle(u3R)*4; + all_u[7]->siz_w = u3a_idle(u3R) * 4; all_u[8] = c3_calloc(sizeof(*all_u[8])); all_u[8]->nam_c = strdup("sweep"); - all_u[8]->siz_w = u3a_sweep()*4; + all_u[8]->siz_w = u3a_sweep() * 4; + + all_u[9] = NULL; if ( c3y == pri_o ) { - _serf_print_quacs(fil_u, 9, all_u); + _serf_print_quacs(fil_u, all_u); } fflush(fil_u); @@ -207,7 +219,7 @@ _serf_grab(u3_noun sac, c3_o pri_o) } #endif - u3_noun mas = _serf_quacs(9, all_u); + u3_noun mas = _serf_quacs( all_u); u3z(sac); return mas; @@ -252,7 +264,6 @@ u3_serf_grab(c3_o pri_o) u3z(gon); } - fprintf(stderr, "serf: measuring memory:\r\n"); if ( u3_nul != sac ) { res = _serf_grab(sac, pri_o); } @@ -260,16 +271,16 @@ u3_serf_grab(c3_o pri_o) fprintf(stderr, "sac is empty\r\n"); u3m_quac** var_u = u3m_mark(); - c3_w tot_w; - tot_w = var_u[0]->siz_w + var_u[1]->siz_w - + var_u[2]->siz_w + var_u[3]->siz_w; - - for ( c3_w i_w = 0; i_w < 4; i_w++ ) { + c3_w tot_w = 0; + 
c3_w i_w = 0; + while ( var_u[i_w] != NULL ) { + tot_w += var_u[i_w]->siz_w; u3a_quac_free(var_u[i_w]); + i_w++; } c3_free(var_u); - u3a_print_memory(stderr, "total marked", tot_w/4); + u3a_print_memory(stderr, "total marked", tot_w / 4); u3a_print_memory(stderr, "free lists", u3a_idle(u3R)); u3a_print_memory(stderr, "sweep", u3a_sweep()); fprintf(stderr, "\r\n"); diff --git a/pkg/vere/vere.h b/pkg/vere/vere.h index 9d4cfdb5aa..59644518f5 100644 --- a/pkg/vere/vere.h +++ b/pkg/vere/vere.h @@ -770,10 +770,15 @@ u3_atom u3_time_t_in_ts(time_t tim); #endif - /* lord_writ_plan(): enqueue a writ and send. + /* u3_lord_writ_new(): allocate a new writ. + */ + u3_writ* + u3_lord_writ_new(u3_lord* god_u); + + /* u3_lord_writ_plan(): enqueue a writ and send. */ void - lord_writ_plan(u3_lord* god_u, u3_writ* wit_u); + u3_lord_writ_plan(u3_lord* god_u, u3_writ* wit_u); /* u3_time_out_ts(): struct timespec from urbit time. */ From b68c670e533b948b99ffd9d777dde503593c09e9 Mon Sep 17 00:00:00 2001 From: Tinnus Napbus Date: Tue, 16 Jul 2024 18:44:11 +1200 Subject: [PATCH 53/97] mass: include loom size in report --- pkg/vere/serf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pkg/vere/serf.c b/pkg/vere/serf.c index fd28f47f08..c410212db9 100644 --- a/pkg/vere/serf.c +++ b/pkg/vere/serf.c @@ -175,7 +175,7 @@ _serf_grab(u3_noun sac, c3_o pri_o) u3z(sac); return u3_nul; } else { - u3m_quac** all_u = c3_malloc(sizeof(*all_u) * 10); + u3m_quac** all_u = c3_malloc(sizeof(*all_u) * 11); all_u[0] = pro_u; u3m_quac** var_u = u3m_mark(); @@ -205,8 +205,12 @@ _serf_grab(u3_noun sac, c3_o pri_o) all_u[8] = c3_calloc(sizeof(*all_u[8])); all_u[8]->nam_c = strdup("sweep"); all_u[8]->siz_w = u3a_sweep() * 4; + + all_u[9] = c3_calloc(sizeof(*all_u[9])); + all_u[9]->nam_c = strdup("loom"); + all_u[9]->siz_w = u3C.wor_i * 4; - all_u[9] = NULL; + all_u[10] = NULL; if ( c3y == pri_o ) { _serf_print_quacs(fil_u, all_u); From 8d567d423f680e306e48137735d6d0355b72f218 Mon Sep 
17 00:00:00 2001 From: midden-fabler Date: Tue, 16 Jul 2024 18:21:26 -0400 Subject: [PATCH 54/97] http: additional range spec compliance --- pkg/vere/io/http.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index c8496e90d0..19367144e6 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -732,7 +732,9 @@ _slice_mime(range_request rng, u3_noun octs) else { // [~ @] if ( rng.end_z > len_w ) { - return out; + // -9000/42 + out.beg_z = 0; + out.end_z = len_w - 1; } else { // slice last bytes @@ -752,7 +754,9 @@ _slice_mime(range_request rng, u3_noun octs) } } else if (rng.end_z > len_w) { - return out; + // 12-9000/42 + out.beg_z = rng.beg_z; + out.end_z = len_w - 1; } else { // [@ @] @@ -944,6 +948,7 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) u3_weak nac = u3h_get(htd_u->nax_p, bem); if ( u3_none == nac ) { + // cache, then serve subsequent range requests from cache req_u->peq_u->las_o = c3n; req_u->peq_u->pax = u3k(bem); u3_pier_peek(htd_u->car_u.pir_u, gang, u3k(u3nt(0, c3__ex, bem)), From 7988ca9d7d45cf6caa154af6be23b58fe64e2b86 Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Wed, 17 Jul 2024 23:15:24 -0400 Subject: [PATCH 55/97] http: cleanup slice_mime --- pkg/vere/io/http.c | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index 19367144e6..8170d07ef3 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -731,41 +731,23 @@ _slice_mime(range_request rng, u3_noun octs) } else { // [~ @] - if ( rng.end_z > len_w ) { - // -9000/42 - out.beg_z = 0; - out.end_z = len_w - 1; - } - else { - // slice last bytes - out.beg_z = len_w - rng.end_z; - out.end_z = len_w - 1; - } + out.beg_z = len_w - c3_min(rng.end_z, len_w); + out.end_z = len_w - 1; } } else if ( SIZE_MAX == rng.end_z ) { // [@ ~] - if ( rng.beg_z > len_w ) { - return out; - } - else { - out.beg_z = rng.beg_z; - out.end_z = 
len_w - 1; - } - } - else if (rng.end_z > len_w) { - // 12-9000/42 out.beg_z = rng.beg_z; out.end_z = len_w - 1; } else { // [@ @] out.beg_z = rng.beg_z; - out.end_z = rng.end_z; + out.end_z = c3_min(rng.end_z, len_w - 1); } - if ( (out.beg_z < len_w) - && (out.end_z < len_w) - && (out.beg_z <= out.end_z) ) + if ( (out.beg_z < len_w) + && (out.end_z < len_w) + && (out.beg_z <= out.end_z) ) { out.dat = u3nc((out.end_z - out.beg_z) + 1, u3qc_cut(3, out.beg_z, (out.end_z + 1) - out.beg_z, oct_w)); @@ -795,7 +777,7 @@ _parse_range(c3_c* txt_c, c3_w len_w) return cut; } -/* _http_req_dispatch(): dispatch http request to %eyre +/* _http_req_dispatch(): dispatch http request */ static void _http_req_dispatch(u3_hreq* req_u, u3_noun req) @@ -823,6 +805,7 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) : u3nc(u3i_string("request"), dat); } + // XX make _is_http_req function if ( (len_w >= 5) && ('_' == bas_c[1]) && ('~' == bas_c[2]) @@ -955,6 +938,7 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) req_u->peq_u, _http_foo_cb); } else { + // XX gang / auth h2o_headers_t req_headers = req_u->rec_u->headers; c3_w idx = h2o_find_header(&req_headers, H2O_TOKEN_RANGE, -1); @@ -1008,6 +992,7 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) } else { + // XX move to first branch // inject to arvo u3_auto_plan(&htd_u->car_u, u3_ovum_init(0, c3__e, wir, cad)); } From 7bf5043d828125994574c18b0f29f54c5ecfad3b Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Thu, 18 Jul 2024 13:10:50 -0400 Subject: [PATCH 56/97] http: auth & cache --- pkg/vere/io/http.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index 8170d07ef3..c74e3a7741 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -694,8 +694,10 @@ _http_foo_cb(void* vod_p, u3_noun nun) } } - // cache only if peek was not at now - if ( c3n == peq_u->las_o ) { + // cache only if peek was not at now, and nun isn't u3_nul + if ( (c3n == peq_u->las_o) + 
&& (u3_nul != nun) ) + { u3h_put(htd_u->nax_p, peq_u->pax, nun); } u3z(peq_u->pax); @@ -930,15 +932,18 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) u3_noun bem = u3nq(our, des, cas, spur); u3_weak nac = u3h_get(htd_u->nax_p, bem); - if ( u3_none == nac ) { - // cache, then serve subsequent range requests from cache + if ( (u3_none == nac) + || (u3_nul == nac) + || ((u3_nul == gang) && (c3y == u3r_at(14, nac))) ) + { + // maybe cache, then serve subsequent range requests from cache req_u->peq_u->las_o = c3n; req_u->peq_u->pax = u3k(bem); u3_pier_peek(htd_u->car_u.pir_u, gang, u3k(u3nt(0, c3__ex, bem)), req_u->peq_u, _http_foo_cb); } else { - // XX gang / auth + h2o_headers_t req_headers = req_u->rec_u->headers; c3_w idx = h2o_find_header(&req_headers, H2O_TOKEN_RANGE, -1); From 77bdfbc0993a9214aecd0f281471dd2ab9cfa364 Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Thu, 18 Jul 2024 22:38:06 -0400 Subject: [PATCH 57/97] http: better slice_mime --- pkg/vere/io/http.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index c74e3a7741..b40261702f 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -726,27 +726,29 @@ _slice_mime(range_request rng, u3_noun octs) out.end_z = SIZE_MAX; out.dat = u3_nul; - if ( SIZE_MAX == rng.beg_z ) { - if ( SIZE_MAX == rng.end_z ) { - // [~ ~] - return out; - } - else { - // [~ @] - out.beg_z = len_w - c3_min(rng.end_z, len_w); - out.end_z = len_w - 1; - } + if ( (SIZE_MAX == rng.beg_z) + && (SIZE_MAX == rng.end_z) ) + { + // [~ ~] + return out; } - else if ( SIZE_MAX == rng.end_z ) { + + if ( SIZE_MAX == rng.end_z ) { // [@ ~] out.beg_z = rng.beg_z; out.end_z = len_w - 1; } + else if ( SIZE_MAX == rng.beg_z ) { + // [~ @] + out.beg_z = len_w - c3_min(rng.end_z, len_w); + out.end_z = len_w - 1; + } else { // [@ @] out.beg_z = rng.beg_z; out.end_z = c3_min(rng.end_z, len_w - 1); } + if ( (out.beg_z < len_w) && (out.end_z < len_w) && 
(out.beg_z <= out.end_z) ) From 1321e3f06d953e201edaf01c322204a4e68a55ac Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Fri, 19 Jul 2024 09:48:20 -0400 Subject: [PATCH 58/97] http: move heavier branches down --- pkg/vere/io/http.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index b40261702f..210b657802 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -809,13 +809,13 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) : u3nc(u3i_string("request"), dat); } - // XX make _is_http_req function - if ( (len_w >= 5) - && ('_' == bas_c[1]) - && ('~' == bas_c[2]) - && ('_' == bas_c[3]) - && ('/' == bas_c[4]) ) + if ( (len_w < 6) + || (0 != memcmp("/_~_/", bas_c, 5)) ) { + // inject to arvo + u3_auto_plan(&htd_u->car_u, u3_ovum_init(0, c3__e, wir, cad)); + } + else { bas_c = bas_c + 4; // retain '/' after /_~_ len_w = len_w - 4; @@ -949,7 +949,10 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) h2o_headers_t req_headers = req_u->rec_u->headers; c3_w idx = h2o_find_header(&req_headers, H2O_TOKEN_RANGE, -1); - if (idx != UINT32_MAX) { + if (idx == UINT32_MAX) { + _http_cache_respond(req_u, nac); + } + else { if ( (req_headers.entries[idx].value.len >= 6) && (0 == memcmp("bytes=", req_headers.entries[idx].value.base, 6)) ) { c3_w rest_len = req_headers.entries[idx].value.len - 6; @@ -990,19 +993,10 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) } } } - else { - _http_cache_respond(req_u, nac); - } } } } } - - else { - // XX move to first branch - // inject to arvo - u3_auto_plan(&htd_u->car_u, u3_ovum_init(0, c3__e, wir, cad)); - } } } From 8b6a86bc6626e35fc8cc48962a52111ce2b4e6c7 Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Mon, 22 Jul 2024 13:55:15 -0400 Subject: [PATCH 59/97] http: cleanup, notes from review --- pkg/vere/io/http.c | 80 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 56 insertions(+), 24 deletions(-) diff --git 
a/pkg/vere/io/http.c b/pkg/vere/io/http.c index 210b657802..39bbbe5538 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -671,11 +671,12 @@ _content_headers(c3_z beg_z, c3_z end_z, c3_w tot_w) return out; } -/* _http_foo_cb() +/* _http_scry_cb() */ static void -_http_foo_cb(void* vod_p, u3_noun nun) +_http_scry_cb(void* vod_p, u3_noun nun) { + // XX slice here u3_preq* peq_u = vod_p; u3_httd* htd_u = peq_u->htd_u; u3_hreq* req_u = peq_u->req_u; @@ -684,8 +685,16 @@ _http_foo_cb(void* vod_p, u3_noun nun) u3_assert(u3_rsat_peek == req_u->sat_e); req_u->peq_u = 0; if ( u3_nul != nun ) { - u3_atom len = u3r_at(254, nun); - u3_noun hez = _content_headers(0, (len - 1), len); + u3_noun len = u3r_at(254, nun); + c3_w len_w; + if ( c3n == u3r_safe_word(len, &len_w) ) { + u3z(nun); + _http_scry_respond(req_u, u3_nul); + u3z(peq_u->pax); + c3_free(peq_u); + return; + } + u3_noun hez = _content_headers(0, (len_w - 1), len_w); u3_noun res = u3i_edit(u3k(nun), 125, u3qb_weld(hez, u3r_at(125, nun))); _http_scry_respond(req_u, u3k(res)); } @@ -698,6 +707,8 @@ _http_foo_cb(void* vod_p, u3_noun nun) if ( (c3n == peq_u->las_o) && (u3_nul != nun) ) { + // XX pair of auth & path for key + // check ~watter-parter's u3h_put(htd_u->nax_p, peq_u->pax, nun); } u3z(peq_u->pax); @@ -712,20 +723,25 @@ typedef struct _range_request { typedef struct _content { c3_z beg_z; c3_z end_z; - u3_noun dat; + u3_noun dat; // XX free } content; +/* _slice_mime: given a valid range, slice a section of octs +*/ static content _slice_mime(range_request rng, u3_noun octs) { - c3_w len_w = u3h(octs); - c3_w oct_w = u3t(octs); content out; - out.beg_z = SIZE_MAX; out.end_z = SIZE_MAX; out.dat = u3_nul; + u3_noun len = u3h(octs); + c3_w len_w; + if ( c3n == u3r_safe_word(len, &len_w) ) { + return out; + } + if ( (SIZE_MAX == rng.beg_z) && (SIZE_MAX == rng.end_z) ) { @@ -754,11 +770,13 @@ _slice_mime(range_request rng, u3_noun octs) && (out.beg_z <= out.end_z) ) { out.dat = u3nc((out.end_z - out.beg_z) 
+ 1, - u3qc_cut(3, out.beg_z, (out.end_z + 1) - out.beg_z, oct_w)); + u3qc_cut(3, out.beg_z, (out.end_z + 1) - out.beg_z, u3t(octs))); } return out; } +/* _parse_range: get a range from '-' delimited text +*/ static range_request _parse_range(c3_c* txt_c, c3_w len_w) { @@ -809,13 +827,15 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) : u3nc(u3i_string("request"), dat); } + // check if base url starts with '/_~_/' if ( (len_w < 6) || (0 != memcmp("/_~_/", bas_c, 5)) ) { - // inject to arvo + // no: inject to arvo u3_auto_plan(&htd_u->car_u, u3_ovum_init(0, c3__e, wir, cad)); } else { + // '/_~_/' found bas_c = bas_c + 4; // retain '/' after /_~_ len_w = len_w - 4; @@ -844,6 +864,7 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) u3_noun des; u3_noun cas; + // XX move to function // get beak from path // for ( c3_w i_w = 0; i_w < 3; ++i_w ) { @@ -907,17 +928,19 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) return; } else { - c3_d len_d = nex_c - bas_c; - *wer = u3i_bytes(len_d, (const c3_y*)bas_c); + c3_w dif_w = (c3_p)(nex_c - bas_c); + *wer = u3i_bytes(dif_w, (const c3_y*)bas_c); bas_c = nex_c; - len_w = len_w - len_d; + len_w = len_w - dif_w; } } } u3_noun spur = u3dc("rush", u3i_bytes(len_w, (const c3_y*)bas_c), u3v_wish("stap")); - if ( (who != our) || (u3_nul == spur) ) { + if ( (u3_nul == spur) + || (c3n == u3r_sing(our, who)) ) + { c3_c* msg_c = "bad scry path"; h2o_send_error_generic(req_u->rec_u, 400, msg_c, msg_c, 0); return; @@ -927,7 +950,7 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) spur = u3nc(u3i_string("mime"), u3t(spur)); if ( c3y == req_u->peq_u->las_o ) { u3_pier_peek_last(htd_u->car_u.pir_u, gang, c3__ex, - des, spur, req_u->peq_u, _http_foo_cb); + des, spur, req_u->peq_u, _http_scry_cb); } else { @@ -939,13 +962,13 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) || ((u3_nul == gang) && (c3y == u3r_at(14, nac))) ) { // maybe cache, then serve subsequent range requests from cache - req_u->peq_u->las_o = c3n; - 
req_u->peq_u->pax = u3k(bem); - u3_pier_peek(htd_u->car_u.pir_u, gang, u3k(u3nt(0, c3__ex, bem)), - req_u->peq_u, _http_foo_cb); + req_u->peq_u->pax = bem; + u3_pier_peek(htd_u->car_u.pir_u, gang, u3nt(0, c3__ex, u3k(bem)), + req_u->peq_u, _http_scry_cb); } else { + // XX function h2o_headers_t req_headers = req_u->rec_u->headers; c3_w idx = h2o_find_header(&req_headers, H2O_TOKEN_RANGE, -1); @@ -959,27 +982,35 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) if ( 0 == rest_len ) { c3_c* msg_c = "Requested Range Not Satisfiable"; h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); + // XX leaks return; } range_request rng_req = _parse_range(req_headers.entries[idx].value.base + 6, rest_len); u3_noun octs = u3r_at(127, nac); if ( u3_none == octs ) { - c3_c* msg_c = "Requested Range Not Satisfiable"; - h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); + h2o_send_error_500(rec_u, "Internal Server Error", "scry failed", 0); return; } content result = _slice_mime(rng_req, octs); if ( u3_nul == result.dat ) { c3_c* msg_c = "Requested Range Not Satisfiable"; + u3z(result.dat); h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); return; } - if ( u3r_sing(result.dat, octs) == c3y) { + if ( c3y == u3r_sing(result.dat, octs) ) { // 200 - u3_atom len = u3r_at(254, nac); - u3_noun hez = _content_headers(0, (len - 1), len); + u3z(result.dat); + u3_noun len = u3r_at(254, nac); + c3_w len_w; + if ( c3n == u3r_safe_word(len, &len_w) ) { + u3z(nac); + _http_scry_respond(req_u, u3_nul); + return; + } + u3_noun hez = _content_headers(0, (len_w - 1), len_w); u3_noun res = u3i_edit(nac, 125, u3qb_weld(hez, u3r_at(125, nac))); _http_cache_respond(req_u, res); } @@ -1102,6 +1133,7 @@ _http_cache_scry_cb(void* vod_p, u3_noun nun) _http_cache_respond(req_u, u3k(nun)); } + // XX pair of auth & path for key u3h_put(htd_u->nax_p, peq_u->pax, nun); u3z(peq_u->pax); c3_free(peq_u); From 529f399a1ac1535767eb22ce0fe4328a2cc68b70 Mon Sep 17 00:00:00 2001 From: Sigilante 
Date: Mon, 22 Jul 2024 16:33:39 -0500 Subject: [PATCH 60/97] Post with i754 mote. --- pkg/c3/motes.h | 1 + pkg/noun/jets/i/lagoon.c | 468 +++++++++++++++++++-------------------- pkg/noun/jets/q.h | 52 ++--- 3 files changed, 258 insertions(+), 263 deletions(-) diff --git a/pkg/c3/motes.h b/pkg/c3/motes.h index 277761185f..6c1217dc2f 100644 --- a/pkg/c3/motes.h +++ b/pkg/c3/motes.h @@ -613,6 +613,7 @@ # define c3__is c3_s2('i','s') # define c3__item c3_s4('i','t','e','m') # define c3__ix c3_s2('i','x') +# define c3__i754 c3_s4('i','7','5','4') # define c3__j c3_s1('j') # define c3__jack c3_s4('j','a','c','k') # define c3__jam c3_s3('j','a','m') diff --git a/pkg/noun/jets/i/lagoon.c b/pkg/noun/jets/i/lagoon.c index 460058ecc7..9ead70d14a 100644 --- a/pkg/noun/jets/i/lagoon.c +++ b/pkg/noun/jets/i/lagoon.c @@ -30,6 +30,16 @@ c3_d c[2]; }; +/* soft check on u3_none return from q jet +*/ + static inline u3_noun _soft_run(u3_noun a) + { + if (u3_none == a) { + u3m_bail(c3__fail); + } + return a; + } + // $?(%n %u %d %z %a) static inline void _set_rounding(c3_w a) @@ -98,20 +108,10 @@ return dims; } -/* soft check on u3_none return from q jet -*/ - static inline u3_noun _soft_run(u3_noun a) - { - if (u3_none == a) { - u3m_bail(c3__fail); - } - return a; - } - /* add - axpy = 1*x+y */ u3_noun - u3qi_la_add_real(u3_noun x_data, + u3qi_la_add_i754(u3_noun x_data, u3_noun y_data, u3_noun shape, u3_noun bloq @@ -121,6 +121,7 @@ if (bloq < 4 || bloq > 7) { return u3_none; } + fprintf(stderr, ">>> u3qi_la_add\n\r"); // Unpack the data as a byte array. We assume total length < 2**64. // len_x is length in base units @@ -135,8 +136,7 @@ // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); - u3r_bytes(0, syz_x, y_bytes, y_data); - y_bytes[syz_x] = 0x1; + u3r_bytes(0, syz_x+1, y_bytes, y_data); // Switch on the block size. 
switch (u3x_atom(bloq)) { @@ -170,7 +170,7 @@ /* sub - axpy = -1*y+x */ u3_noun - u3qi_la_sub_real(u3_noun x_data, + u3qi_la_sub_i754(u3_noun x_data, u3_noun y_data, u3_noun shape, u3_noun bloq @@ -194,9 +194,8 @@ // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); - u3r_bytes(0, syz_x, y_bytes, y_data); - y_bytes[syz_x] = 0x1; - + u3r_bytes(0, syz_x+1, y_bytes, y_data); + // Switch on the block size. switch (u3x_atom(bloq)) { case 4: @@ -231,7 +230,7 @@ elementwise multiplication */ u3_noun - u3qi_la_mul_real(u3_noun x_data, + u3qi_la_mul_i754(u3_noun x_data, u3_noun y_data, u3_noun shape, u3_noun bloq) @@ -254,8 +253,7 @@ // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); - u3r_bytes(0, syz_x, y_bytes, y_data); - y_bytes[syz_x] = 0x1; + u3r_bytes(0, syz_x+1, y_bytes, y_data); // Switch on the block size. switch (u3x_atom(bloq)) { @@ -298,7 +296,7 @@ elementwise division */ u3_noun - u3qi_la_div_real(u3_noun x_data, + u3qi_la_div_i754(u3_noun x_data, u3_noun y_data, u3_noun shape, u3_noun bloq) @@ -321,8 +319,7 @@ // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); - u3r_bytes(0, syz_x, y_bytes, y_data); - y_bytes[syz_x] = 0x1; + u3r_bytes(0, syz_x+1, y_bytes, y_data); // Switch on the block size. switch (u3x_atom(bloq)) { @@ -365,7 +362,7 @@ remainder after division */ u3_noun - u3qi_la_mod_real(u3_noun x_data, + u3qi_la_mod_i754(u3_noun x_data, u3_noun y_data, u3_noun shape, u3_noun bloq) @@ -388,8 +385,7 @@ // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); - u3r_bytes(0, syz_x, y_bytes, y_data); - y_bytes[syz_x] = 0x1; + u3r_bytes(0, syz_x+1, y_bytes, y_data); // Switch on the block size. switch (u3x_atom(bloq)) { @@ -474,7 +470,7 @@ /* cumsum - x[0] + x[1] + ... 
x[n] */ u3_noun - u3qi_la_cumsum_real(u3_noun x_data, + u3qi_la_cumsum_i754(u3_noun x_data, u3_noun shape, u3_noun bloq) { @@ -498,7 +494,7 @@ // Switch on the block size. switch (u3x_atom(bloq)) { - case 4: ; + case 4: { float16_t sum16[2]; sum16[0] = (float16_t){SB_REAL16_ZERO}; for (c3_d i = 0; i < len_x; i++) { @@ -506,9 +502,9 @@ } sum16[1].v = 0x1; r_data = u3i_bytes((2+1)*sizeof(c3_y), (c3_y*)sum16); - break; + break;} - case 5: ; + case 5: { float32_t sum32[2]; sum32[0] = (float32_t){SB_REAL32_ZERO}; for (c3_d i = 0; i < len_x; i++) { @@ -516,9 +512,9 @@ } sum32[1].v = 0x1; r_data = u3i_bytes((4+1)*sizeof(c3_y), (c3_y*)sum32); - break; + break;} - case 6: ; + case 6: { float64_t sum64[2]; sum64[0] = (float64_t){SB_REAL64_ZERO}; for (c3_d i = 0; i < len_x; i++) { @@ -526,9 +522,9 @@ } sum64[1].v = 0x1; r_data = u3i_bytes((8+1)*sizeof(c3_y), (c3_y*)sum64); - break; + break;} - case 7: ; + case 7: { float128_t sum128[2]; sum128[0] = (float128_t){SB_REAL128L_ZERO, SB_REAL128U_ZERO}; for (c3_d i = 0; i < len_x; i++) { @@ -536,7 +532,7 @@ } sum128[1] = (float128_t){0x1, 0x0}; r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)sum128); - break; + break;} } // Clean up and return. 
@@ -548,7 +544,7 @@ /* argmin - argmin(x) */ u3_noun - u3qi_la_argmin_real(u3_noun x_data, + u3qi_la_argmin_i754(u3_noun x_data, u3_noun shape, u3_noun bloq) { @@ -579,8 +575,8 @@ min_val16 = ((float16_t*)x_bytes)[i]; min_idx = (len_x - i - 1); } - } } - break; + } + break;} case 5: { float32_t min_val32 = ((float32_t*)x_bytes)[0]; @@ -589,8 +585,8 @@ min_val32 = ((float32_t*)x_bytes)[i]; min_idx = (len_x - i - 1); } - } } - break; + } + break;} case 6: { float64_t min_val64 = ((float64_t*)x_bytes)[0]; @@ -599,8 +595,8 @@ min_val64 = ((float64_t*)x_bytes)[i]; min_idx = (len_x - i - 1); } - } } - break; + } + break;} case 7: { float128_t min_val128 = ((float128_t*)x_bytes)[0]; @@ -609,8 +605,8 @@ min_val128 = *f128M_min(&min_val128, &((float128_t*)x_bytes)[i]); min_idx = (len_x - i - 1); } - } } - break; + } + break;} } u3_noun r_data = u3i_chub(min_idx); @@ -621,7 +617,7 @@ /* argmax - argmax(x) */ u3_noun - u3qi_la_argmax_real(u3_noun x_data, + u3qi_la_argmax_i754(u3_noun x_data, u3_noun shape, u3_noun bloq) { @@ -652,8 +648,8 @@ max_val16 = ((float16_t*)x_bytes)[i]; max_idx = (len_x - i - 1); } - } } - break; + } + break;} case 5: { float32_t max_val32 = ((float32_t*)x_bytes)[0]; @@ -662,8 +658,8 @@ max_val32 = ((float32_t*)x_bytes)[i]; max_idx = (len_x - i - 1); } - } } - break; + } + break;} case 6: { float64_t max_val64 = ((float64_t*)x_bytes)[0]; @@ -672,8 +668,8 @@ max_val64 = ((float64_t*)x_bytes)[i]; max_idx = (len_x - i - 1); } - } } - break; + } + break;} case 7: { float128_t max_val128 = ((float128_t*)x_bytes)[0]; @@ -682,8 +678,8 @@ max_val128 = *f128M_max(&max_val128, &((float128_t*)x_bytes)[i]); max_idx = (len_x - i - 1); } - } } - break; + } + break;} } u3_noun r_data = u3i_chub(max_idx); @@ -695,7 +691,7 @@ entire nd-array busted out as a linear list */ u3_noun - u3qi_la_ravel_real(u3_noun x_data, + u3qi_la_ravel_i754(u3_noun x_data, u3_noun shape, u3_noun bloq) { @@ -758,7 +754,7 @@ /* min - min(x,y) */ u3_noun - u3qi_la_min_real(u3_noun x_data, + 
u3qi_la_min_i754(u3_noun x_data, u3_noun shape, u3_noun bloq) { @@ -782,7 +778,7 @@ // Switch on the block size. switch (u3x_atom(bloq)) { - case 4: ; + case 4: { float16_t min_val16 = ((float16_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { min_val16 = f16_min(min_val16, ((float16_t*)x_bytes)[i]); @@ -791,9 +787,9 @@ r16[0] = min_val16; r16[1].v = 0x1; r_data = u3i_bytes((2+1)*sizeof(c3_y), (c3_y*)r16); - break; + break;} - case 5: ; + case 5: { float32_t min_val32 = ((float32_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { min_val32 = f32_min(min_val32, ((float32_t*)x_bytes)[i]); @@ -802,9 +798,9 @@ r32[0] = min_val32; r32[1].v = 0x1; r_data = u3i_bytes((4+1)*sizeof(c3_y), (c3_y*)r32); - break; + break;} - case 6: ; + case 6: { float64_t min_val64 = ((float64_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { min_val64 = f64_min(min_val64, ((float64_t*)x_bytes)[i]); @@ -813,9 +809,9 @@ r64[0] = min_val64; r64[1].v = 0x1; r_data = u3i_bytes((8+1)*sizeof(c3_y), (c3_y*)r64); - break; + break;} - case 7: ; + case 7: { float128_t min_val128 = ((float128_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { min_val128 = *f128M_min(&min_val128, &((float128_t*)x_bytes)[i]); @@ -824,7 +820,7 @@ r128[0] = min_val128; r128[1] = (float128_t){0x1, 0x0}; r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)r128); - break; + break;} } // Clean up and return. @@ -836,7 +832,7 @@ /* max - max(x,y) */ u3_noun - u3qi_la_max_real(u3_noun x_data, + u3qi_la_max_i754(u3_noun x_data, u3_noun shape, u3_noun bloq) { @@ -860,7 +856,7 @@ // Switch on the block size. 
switch (u3x_atom(bloq)) { - case 4: ; + case 4: { float16_t max_val16 = ((float16_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { max_val16 = f16_max(max_val16, ((float16_t*)x_bytes)[i]); @@ -869,9 +865,9 @@ r16[0] = max_val16; r16[1].v = 0x1; r_data = u3i_bytes((2+1)*sizeof(c3_y), (c3_y*)r16); - break; + break;} - case 5: ; + case 5: { float32_t max_val32 = ((float32_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { max_val32 = f32_max(max_val32, ((float32_t*)x_bytes)[i]); @@ -880,9 +876,9 @@ r32[0] = max_val32; r32[1].v = 0x1; r_data = u3i_bytes((4+1)*sizeof(c3_y), (c3_y*)r32); - break; + break;} - case 6: ; + case 6: { float64_t max_val64 = ((float64_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { max_val64 = f64_max(max_val64, ((float64_t*)x_bytes)[i]); @@ -891,9 +887,9 @@ r64[0] = max_val64; r64[1].v = 0x1; r_data = u3i_bytes((8+1)*sizeof(c3_y), (c3_y*)r64); - break; + break;} - case 7: ; + case 7: { float128_t max_val128 = ((float128_t*)x_bytes)[0]; for (c3_d i = 0; i < len_x; i++) { max_val128 = *f128M_max(&max_val128, &((float128_t*)x_bytes)[i]); @@ -902,7 +898,7 @@ r128[0] = max_val128; r128[1] = (float128_t){0x1, 0x0}; r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)r128); - break; + break;} } // Clean up and return. 
@@ -914,7 +910,7 @@ /* abs - |x| */ u3_noun - u3qi_la_abs_real(u3_noun x_data, + u3qi_la_abs_i754(u3_noun x_data, u3_noun shape, u3_noun bloq) { @@ -973,7 +969,7 @@ /* gth - x > y */ u3_noun - u3qi_la_gth_real(u3_noun x_data, + u3qi_la_gth_i754(u3_noun x_data, u3_noun y_data, u3_noun shape, u3_noun bloq) @@ -1046,7 +1042,7 @@ /* gte - x > y */ u3_noun - u3qi_la_gte_real(u3_noun x_data, + u3qi_la_gte_i754(u3_noun x_data, u3_noun y_data, u3_noun shape, u3_noun bloq) @@ -1119,7 +1115,7 @@ /* lth - x > y */ u3_noun - u3qi_la_lth_real(u3_noun x_data, + u3qi_la_lth_i754(u3_noun x_data, u3_noun y_data, u3_noun shape, u3_noun bloq) @@ -1192,7 +1188,7 @@ /* lte - x > y */ u3_noun - u3qi_la_lte_real(u3_noun x_data, + u3qi_la_lte_i754(u3_noun x_data, u3_noun y_data, u3_noun shape, u3_noun bloq) @@ -1265,7 +1261,7 @@ /* adds - axpy = 1*x+[n] */ u3_noun - u3qi_la_adds_real(u3_noun x_data, + u3qi_la_adds_i754(u3_noun x_data, u3_noun n, u3_noun shape, u3_noun bloq) @@ -1347,7 +1343,7 @@ /* subs - axpy = -1*[n]+x */ u3_noun - u3qi_la_subs_real(u3_noun x_data, + u3qi_la_subs_i754(u3_noun x_data, u3_noun n, u3_noun shape, u3_noun bloq) @@ -1430,7 +1426,7 @@ elementwise multiplication */ u3_noun - u3qi_la_muls_real(u3_noun x_data, + u3qi_la_muls_i754(u3_noun x_data, u3_noun n, u3_noun shape, u3_noun bloq) @@ -1493,7 +1489,7 @@ elementwise division */ u3_noun - u3qi_la_divs_real(u3_noun x_data, + u3qi_la_divs_i754(u3_noun x_data, u3_noun n, u3_noun shape, u3_noun bloq) @@ -1564,7 +1560,7 @@ remainder after scalar division */ u3_noun - u3qi_la_mods_real(u3_noun x_data, + u3qi_la_mods_i754(u3_noun x_data, u3_noun n, u3_noun shape, u3_noun bloq) @@ -1681,7 +1677,7 @@ /* dot - ?dot = x · y */ u3_noun - u3qi_la_dot_real(u3_noun x_data, + u3qi_la_dot_i754(u3_noun x_data, u3_noun y_data, u3_noun shape, u3_noun bloq) @@ -1710,33 +1706,33 @@ // Switch on the block size. 
switch (u3x_atom(bloq)) { - case 4: ; + case 4: { float16_t r16[2]; r16[0] = hdot(len_x, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); r16[1].v = 0x1; r_data = u3i_bytes((2+1)*sizeof(c3_y), (c3_y*)r16); - break; + break;} - case 5: ; + case 5: { float32_t r32[2]; r32[0] = sdot(len_x, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); r32[1].v = 0x1; r_data = u3i_bytes((4+1)*sizeof(c3_y), (c3_y*)r32); - break; + break;} - case 6: ; + case 6: { float64_t r64[2]; r64[0] = ddot(len_x, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); r64[1].v = 0x1; r_data = u3i_bytes((8+1)*sizeof(c3_y), (c3_y*)r64); - break; + break;} - case 7: ; + case 7: { float128_t r128[2]; r128[0] = qdot(len_x, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); r128[1] = (float128_t){0x1, 0x0}; r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)r128); - break; + break;} } // Clean up and return. @@ -1759,12 +1755,12 @@ } // Assert length of dims is 2. if (u3qb_lent(shape) != 2) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } // Unpack shape into an array of dimensions. c3_d *dims = _get_dims(shape); if (dims[0] != dims[1]) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } // Unpack the data as a byte array. We assume total length < 2**64. @@ -1806,7 +1802,7 @@ { // Assert length of dims is 2. if (u3qb_lent(shape) != 2) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } // Unpack shape into an array of dimensions. c3_d *dims = _get_dims(shape); @@ -1845,7 +1841,7 @@ /* linspace - [a a+(b-a)/n ... 
b] */ u3_noun - u3qi_la_linspace_real(u3_noun a, + u3qi_la_linspace_i754(u3_noun a, u3_noun b, u3_noun n, u3_noun bloq) @@ -1858,7 +1854,7 @@ u3_noun r_data; switch (u3x_atom(bloq)) { - case 4: ; + case 4: { float16_t a16, b16; u3r_bytes(0, 2, (c3_y*)&(a16.v), a); u3r_bytes(0, 2, (c3_y*)&(b16.v), b); @@ -1873,9 +1869,9 @@ x_bytes16[(n+1)*2] = 0x1; // pin head r_data = u3i_bytes(((n+1)*2+1)*sizeof(c3_y), x_bytes16); u3a_free(x_bytes16); - break; + break;} - case 5: ; + case 5: { float32_t a32, b32; u3r_bytes(0, 4, (c3_y*)&(a32.v), a); u3r_bytes(0, 4, (c3_y*)&(b32.v), b); @@ -1890,9 +1886,9 @@ x_bytes32[(n+1)*4] = 0x1; // pin head r_data = u3i_bytes(((n+1)*4+1)*sizeof(c3_y), x_bytes32); u3a_free(x_bytes32); - break; + break;} - case 6: ; + case 6: { float64_t a64, b64; u3r_bytes(0, 8, (c3_y*)&(a64.v), a); u3r_bytes(0, 8, (c3_y*)&(b64.v), b); @@ -1907,9 +1903,9 @@ x_bytes64[(n+1)*8] = 0x1; // pin head r_data = u3i_bytes(((n+1)*8+1)*sizeof(c3_y), x_bytes64); u3a_free(x_bytes64); - break; + break;} - case 7: ; + case 7: { float128_t a128, b128; u3r_bytes(0, 16, (c3_y*)&(a128.v[0]), a); u3r_bytes(0, 16, (c3_y*)&(b128.v[0]), b); @@ -1931,7 +1927,7 @@ x_bytes128[(n+1)*16] = 0x1; // pin head r_data = u3i_bytes(((n+1)*16+1)*sizeof(c3_y), x_bytes128); u3a_free(x_bytes128); - break; + break;} } return r_data; @@ -1940,7 +1936,7 @@ /* range - [a a+d ... 
b] */ u3_noun - u3qi_la_range_real(u3_noun a, + u3qi_la_range_i754(u3_noun a, u3_noun b, u3_noun d, u3_noun bloq) @@ -1953,7 +1949,7 @@ u3_noun r_data; switch (u3x_atom(bloq)) { - case 4: ; + case 4: { float16_t a16, b16, interval16; u3r_bytes(0, 2, (c3_y*)&(a16.v), a); u3r_bytes(0, 2, (c3_y*)&(b16.v), b); @@ -1968,9 +1964,9 @@ x_bytes16[(n16+1)*2] = 0x1; // pin head r_data = u3i_bytes(((n16+1)*2+1)*sizeof(c3_y), x_bytes16); u3a_free(x_bytes16); - break; + break;} - case 5: ; + case 5: { float32_t a32, b32, interval32; u3r_bytes(0, 4, (c3_y*)&(a32.v), a); u3r_bytes(0, 4, (c3_y*)&(b32.v), b); @@ -1985,9 +1981,9 @@ x_bytes32[(n32+1)*4] = 0x1; // pin head r_data = u3i_bytes(((n32+1)*4+1)*sizeof(c3_y), x_bytes32); u3a_free(x_bytes32); - break; + break;} - case 6: ; + case 6: { float64_t a64, b64, interval64; u3r_bytes(0, 8, (c3_y*)&(a64.v), a); u3r_bytes(0, 8, (c3_y*)&(b64.v), b); @@ -2002,9 +1998,9 @@ x_bytes64[(n64+1)*8] = 0x1; // pin head r_data = u3i_bytes(((n64+1)*8+1)*sizeof(c3_y), x_bytes64); u3a_free(x_bytes64); - break; + break;} - case 7: ; + case 7: { float128_t a128, b128, interval128; u3r_bytes(0, 16, (c3_y*)&(a128.v[0]), a); u3r_bytes(0, 16, (c3_y*)&(b128.v[0]), b); @@ -2025,7 +2021,7 @@ x_bytes128[(n128+1)*16] = 0x1; // pin head r_data = u3i_bytes(((n128+1)*16+1)*sizeof(c3_y), x_bytes128); u3a_free(x_bytes128); - break; + break;} } return r_data; @@ -2034,20 +2030,20 @@ /* trace - tr(x) */ u3_noun - u3qi_la_trace_real(u3_noun x_data, + u3qi_la_trace_i754(u3_noun x_data, u3_noun shape, u3_noun bloq) { u3_noun d_data = u3qi_la_diag(x_data, shape, bloq); c3_d len_x0 = _get_dims(shape)[0]; - u3_noun r_data = u3qi_la_dot_real(d_data, d_data, u3nt(len_x0, 0x1, u3_nul), u3k(bloq)); + u3_noun r_data = u3qi_la_dot_i754(d_data, d_data, u3nt(len_x0, 0x1, u3_nul), u3k(bloq)); return r_data; } /* mmul */ u3_noun - u3qi_la_mmul_real(u3_noun x_data, + u3qi_la_mmul_i754(u3_noun x_data, u3_noun y_data, u3_noun x_shape, u3_noun y_shape, @@ -2059,15 +2055,10 @@ c3_d Nb= 
u3x_atom(u3h(y_shape)); c3_d P = u3x_atom(u3h(u3t(y_shape))); - // Fence on valid bloq size. - if (bloq < 4 || bloq > 7) { - return u3_none; - } - if ((u3_nul != u3t(u3t(x_shape))) || (u3_nul != u3t(u3t(y_shape))) || (Na != Nb)) { - return u3_none; + return u3m_bail(c3__exit); } c3_d N = Na; @@ -2134,7 +2125,7 @@ u3a_free(y_bytes); u3a_free(r_bytes); - return u3nc(u3nq(u3nt(M_, P_, u3_nul), u3k(bloq), c3__real, u3_nul), r_data); + return u3nc(u3nq(u3nt(M_, P_, u3_nul), u3k(bloq), c3__i754, u3_nul), r_data); } u3_noun @@ -2143,6 +2134,7 @@ // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, y_meta, y_data; + fprintf(stderr, "> u3wi_la_add\n\r"); if ( c3n == u3r_mean(cor, u3x_sam_4, &x_meta, @@ -2153,7 +2145,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -2177,12 +2169,14 @@ // fxp does not need to match here so no check ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { + fprintf(stderr, "> u3wi_la_add\n\r"); switch (x_kind) { - case c3__real: + case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_add_real(x_data, y_data, x_shape, x_bloq)); + fprintf(stderr, ">> u3wi_la_add\n\r"); + u3_noun r_data = _soft_run(u3qi_la_add_i754(x_data, y_data, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2208,7 +2202,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -2232,12 +2226,12 @@ // fxp does not need to match here so no check ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: + case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_sub_real(x_data, y_data, x_shape, x_bloq)); + u3_noun r_data = _soft_run(u3qi_la_sub_i754(x_data, 
y_data, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2263,7 +2257,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -2287,12 +2281,12 @@ // fxp does not need to match here so no check ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: + case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_mul_real(x_data, y_data, x_shape, x_bloq)); + u3_noun r_data = _soft_run(u3qi_la_mul_i754(x_data, y_data, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2318,7 +2312,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -2342,12 +2336,12 @@ // fxp does not need to match here so no check ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: + case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_div_real(x_data, y_data, x_shape, x_bloq)); + u3_noun r_data = _soft_run(u3qi_la_div_i754(x_data, y_data, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2373,7 +2367,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -2397,12 +2391,12 @@ // fxp does not need to match here so no check ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: + case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_mod_real(x_data, y_data, x_shape, x_bloq)); + u3_noun r_data = _soft_run(u3qi_la_mod_i754(x_data, y_data, x_shape, 
x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2424,7 +2418,7 @@ 0) || c3n == u3ud(x_data) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, rnd; @@ -2437,12 +2431,12 @@ c3n == u3ud(x_kind) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: + case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_cumsum_real(x_data, x_shape, x_bloq)); + u3_noun r_data = _soft_run(u3qi_la_cumsum_i754(x_data, x_shape, x_bloq)); return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2464,7 +2458,7 @@ 0) || c3n == u3ud(x_data) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; x_shape = u3h(x_meta); // 2 @@ -2475,13 +2469,13 @@ c3n == u3ud(x_kind) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: ; - u3_noun r_data = u3qi_la_argmin_real(x_data, x_shape, x_bloq); + case c3__i754: { + u3_noun r_data = _soft_run(u3qi_la_argmin_i754(x_data, x_shape, x_bloq)); // bare atom (@ index) - return r_data; + return r_data;} default: return u3_none; @@ -2502,7 +2496,7 @@ 0) || c3n == u3ud(x_data) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; x_shape = u3h(x_meta); // 2 @@ -2512,13 +2506,13 @@ c3n == u3ud(x_kind) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: ; - u3_noun r_data = _soft_run(u3qi_la_ravel_real(x_data, x_shape, x_bloq)); + case c3__i754: { + u3_noun r_data = _soft_run(u3qi_la_ravel_i754(x_data, x_shape, x_bloq)); // (list @) - return r_data; + return r_data;} default: return u3_none; @@ -2539,7 +2533,7 @@ 0) || c3n == u3ud(x_data) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; x_shape = u3h(x_meta); // 2 
@@ -2550,13 +2544,13 @@ c3n == u3ud(x_kind) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: ; - u3_noun r_data = u3qi_la_argmax_real(x_data, x_shape, x_bloq); + case c3__i754: { + u3_noun r_data = _soft_run(u3qi_la_argmax_i754(x_data, x_shape, x_bloq)); // bare atom (@ index) - return r_data; + return r_data;} default: return u3_none; @@ -2577,7 +2571,7 @@ 0) || c3n == u3ud(x_data) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; x_shape = u3h(x_meta); // 2 @@ -2588,12 +2582,12 @@ c3n == u3ud(x_kind) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: ; - u3_noun r_data = _soft_run(u3qi_la_min_real(x_data, x_shape, x_bloq)); - return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + case c3__i754: { + u3_noun r_data = _soft_run(u3qi_la_min_i754(x_data, x_shape, x_bloq)); + return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} default: return u3_none; @@ -2614,7 +2608,7 @@ 0) || c3n == u3ud(x_data) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; x_shape = u3h(x_meta); // 2 @@ -2625,12 +2619,12 @@ c3n == u3ud(x_kind) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: ; - u3_noun r_data = _soft_run(u3qi_la_max_real(x_data, x_shape, x_bloq)); - return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + case c3__i754: { + u3_noun r_data = _soft_run(u3qi_la_max_i754(x_data, x_shape, x_bloq)); + return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} default: return u3_none; @@ -2651,7 +2645,7 @@ 0) || c3n == u3ud(x_data) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; x_shape = u3h(x_meta); // 2 @@ -2662,12 +2656,12 @@ c3n == 
u3ud(x_kind) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: ; - u3_noun r_data = _soft_run(u3qi_la_abs_real(x_data, x_shape, x_bloq)); - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + case c3__i754: { + u3_noun r_data = _soft_run(u3qi_la_abs_i754(x_data, x_shape, x_bloq)); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} default: return u3_none; @@ -2692,7 +2686,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -2715,12 +2709,12 @@ c3n == u3r_sing(x_fxp, y_fxp) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: ; - u3_noun r_data = _soft_run(u3qi_la_gth_real(x_data, y_data, x_shape, x_bloq)); - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + case c3__i754: { + u3_noun r_data = _soft_run(u3qi_la_gth_i754(x_data, y_data, x_shape, x_bloq)); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} default: return u3_none; @@ -2745,7 +2739,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -2768,12 +2762,12 @@ c3n == u3r_sing(x_fxp, y_fxp) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: ; - u3_noun r_data = _soft_run(u3qi_la_gte_real(x_data, y_data, x_shape, x_bloq)); - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + case c3__i754: { + u3_noun r_data = _soft_run(u3qi_la_gte_i754(x_data, y_data, x_shape, x_bloq)); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} default: return u3_none; @@ -2798,7 +2792,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - 
u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -2821,12 +2815,12 @@ c3n == u3r_sing(x_fxp, y_fxp) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: ; - u3_noun r_data = _soft_run(u3qi_la_lth_real(x_data, y_data, x_shape, x_bloq)); - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + case c3__i754: { + u3_noun r_data = _soft_run(u3qi_la_lth_i754(x_data, y_data, x_shape, x_bloq)); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} default: return u3_none; @@ -2851,7 +2845,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -2874,12 +2868,12 @@ c3n == u3r_sing(x_fxp, y_fxp) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: ; - u3_noun r_data = _soft_run(u3qi_la_lte_real(x_data, y_data, x_shape, x_bloq)); - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + case c3__i754: { + u3_noun r_data = _soft_run(u3qi_la_lte_i754(x_data, y_data, x_shape, x_bloq)); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} default: return u3_none; @@ -2902,7 +2896,7 @@ c3n == u3ud(x_data) || c3n == u3ud(n) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, rnd; @@ -2912,9 +2906,9 @@ x_fxp = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 switch (x_kind) { - case c3__real: + case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_adds_real(x_data, n, x_shape, x_bloq)); + u3_noun r_data = _soft_run(u3qi_la_adds_i754(x_data, n, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2937,7 +2931,7 @@ c3n == u3ud(x_data) 
|| c3n == u3ud(n) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, rnd; @@ -2947,9 +2941,9 @@ x_fxp = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 switch (x_kind) { - case c3__real: + case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_subs_real(x_data, n, x_shape, x_bloq)); + u3_noun r_data = _soft_run(u3qi_la_subs_i754(x_data, n, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2972,7 +2966,7 @@ c3n == u3ud(x_data) || c3n == u3ud(n) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, rnd; @@ -2982,9 +2976,9 @@ x_fxp = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 switch (x_kind) { - case c3__real: + case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_muls_real(x_data, n, x_shape, x_bloq)); + u3_noun r_data = _soft_run(u3qi_la_muls_i754(x_data, n, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -3007,7 +3001,7 @@ c3n == u3ud(x_data) || c3n == u3ud(n) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, rnd; @@ -3017,9 +3011,9 @@ x_fxp = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 switch (x_kind) { - case c3__real: + case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_divs_real(x_data, n, x_shape, x_bloq)); + u3_noun r_data = _soft_run(u3qi_la_divs_i754(x_data, n, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -3042,7 +3036,7 @@ c3n == u3ud(x_data) || c3n == u3ud(n) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, rnd; @@ -3052,9 +3046,9 @@ x_fxp = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 switch (x_kind) { - case c3__real: + 
case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_mods_real(x_data, n, x_shape, x_bloq)); + u3_noun r_data = _soft_run(u3qi_la_mods_i754(x_data, n, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -3079,7 +3073,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -3103,12 +3097,12 @@ c3n == u3r_sing(x_fxp, y_fxp) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: + case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_dot_real(x_data, y_data, x_shape, x_bloq)); + u3_noun r_data = _soft_run(u3qi_la_dot_i754(x_data, y_data, x_shape, x_bloq)); c3_d len_x0 = _get_dims(x_shape)[0]; return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); @@ -3131,7 +3125,7 @@ 0) || c3n == u3ud(x_data) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; x_shape = u3h(x_meta); // 2 @@ -3142,9 +3136,9 @@ c3n == u3ud(x_kind) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { - u3_noun r_data = u3qi_la_transpose(x_data, x_shape, x_bloq); + u3_noun r_data = _soft_run(u3qi_la_transpose(x_data, x_shape, x_bloq)); return u3nc(u3nq(u3nt(u3k(u3h(x_shape)), u3k(u3h(u3t(x_shape))), u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); } } @@ -3162,7 +3156,7 @@ u3x_sam_7, &n, 0)) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; x_shape = u3h(x_meta); // 2 @@ -3174,12 +3168,12 @@ c3n == u3ud(x_kind) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: + case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_linspace_real(a, b, n, x_bloq)); + u3_noun r_data = _soft_run(u3qi_la_linspace_i754(a, b, n, x_bloq)); 
x_shape = u3nt(u3x_atom(n), 0x1, u3_nul); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); @@ -3202,7 +3196,7 @@ u3x_sam_7, &d, 0)) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; x_shape = u3h(x_meta); // 2 @@ -3214,12 +3208,12 @@ c3n == u3ud(x_kind) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: + case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_range_real(a, b, d, x_bloq)); + u3_noun r_data = _soft_run(u3qi_la_range_i754(a, b, d, x_bloq)); c3_d a_, b_, d_; c3_ds n_; switch (x_bloq) { @@ -3274,7 +3268,7 @@ 0) || c3n == u3ud(x_data) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; x_shape = u3h(x_meta); // 2 @@ -3285,7 +3279,7 @@ c3n == u3ud(x_kind) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun r_data = _soft_run(u3qi_la_diag(x_data, x_shape, x_bloq)); c3_d len_x0 = _get_dims(x_shape)[0]; @@ -3306,7 +3300,7 @@ 0) || c3n == u3ud(x_data) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp; if ( c3n == u3r_mean(x_meta, @@ -3317,12 +3311,12 @@ 0) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: ; - u3_noun r_data = _soft_run(u3qi_la_trace_real(x_data, x_shape, x_bloq)); - return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + case c3__i754: { + u3_noun r_data = _soft_run(u3qi_la_trace_i754(x_data, x_shape, x_bloq)); + return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} default: return u3_none; @@ -3347,7 +3341,7 @@ c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { - u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { u3_noun x_shape, x_bloq, x_kind, x_fxp, y_shape, y_bloq, y_kind, y_fxp, @@ -3366,12 +3360,12 @@ // fxp does not need to match so no check ) { - 
u3m_bail(c3__exit); + return u3m_bail(c3__exit); } else { switch (x_kind) { - case c3__real: + case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_mmul_real(x_data, y_data, x_shape, y_shape, x_bloq)); + u3_noun r_data = _soft_run(u3qi_la_mmul_i754(x_data, y_data, x_shape, y_shape, x_bloq)); // result is already [meta data] return r_data; diff --git a/pkg/noun/jets/q.h b/pkg/noun/jets/q.h index bb366844e2..720cd8a2de 100644 --- a/pkg/noun/jets/q.h +++ b/pkg/noun/jets/q.h @@ -245,34 +245,34 @@ u3_noun u3qfp_nepo(u3_noun, u3_noun); u3_noun u3qfp_rake(u3_noun); - u3_noun u3qi_la_add_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_sub_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_mul_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_div_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_mod_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_adds_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_subs_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_muls_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_divs_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_mods_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_dot_real(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_add_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_sub_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_mul_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_div_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_mod_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_adds_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_subs_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_muls_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_divs_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_mods_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_dot_i754(u3_noun, u3_noun, u3_noun, 
u3_noun); u3_noun u3qi_la_diag(u3_noun, u3_noun, u3_noun); u3_noun u3qi_la_transpose(u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_cumsum_real(u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_argmin_real(u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_argmax_real(u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_ravel_real(u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_min_real(u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_max_real(u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_linspace_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_range_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_abs_real(u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_gth_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_gte_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_lth_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_lte_real(u3_noun, u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_trace_real(u3_noun, u3_noun, u3_noun); - u3_noun u3qi_la_mmul_real(u3_noun, u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_cumsum_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_argmin_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_argmax_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_ravel_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_min_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_max_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_linspace_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_range_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_abs_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_gth_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_gte_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_lth_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_lte_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_trace_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_mmul_i754(u3_noun, u3_noun, u3_noun, u3_noun, u3_noun); # define u3qfu_van_fan 28 # define u3qfu_van_rib 
58 From 22181027d584fdc50fa4084a9e0ded4aeb345569 Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Tue, 23 Jul 2024 09:43:41 -0400 Subject: [PATCH 61/97] http: create function for beam logic (WIP) --- pkg/vere/io/http.c | 187 ++++++++++++++++++++++++--------------------- 1 file changed, 102 insertions(+), 85 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index 39bbbe5538..ba4c64d5f9 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -7,6 +7,7 @@ #include "openssl/err.h" #include "openssl/ssl.h" #include "version.h" +#include typedef struct _u3_h2o_serv { h2o_globalconf_t fig_u; // h2o global config @@ -799,6 +800,97 @@ _parse_range(c3_c* txt_c, c3_w len_w) return cut; } +typedef struct _beam { + u3_noun who; + u3_noun des; + u3_noun cas; + u3_noun pur; +} beam; + +/* _get_beam: url to beam +*/ +static beam +_get_beam(u3_hreq* req_u, c3_c* txt_c, c3_w len_w) +{ + beam bem; + u3_http* htp_u = req_u->hon_u->htp_u; + u3_httd* htd_u = htp_u->htd_u; + u3_noun our = u3dc("scot", 'p', u3i_chubs(2, htd_u->car_u.pir_u->who_d)); + // get beak from path + // + for ( c3_w i_w = 0; i_w < 3; ++i_w ) { + u3_noun* wer; + if ( 0 == i_w ) { + wer = &bem.who; + } + else if ( 1 == i_w ) { + wer = &bem.des; + } + else { + wer = &bem.cas; + } + + // find '//' + if ( (len_w >= 2) + && ('/' == txt_c[0]) + && ('/' == txt_c[1]) ) + { + *wer = u3_nul; + txt_c++; + len_w--; + } + // skip '/' + else if ( (len_w > 0) && ('/' == txt_c[0]) ) { + txt_c++; + len_w--; + } + // '=' + if ( (len_w > 0) && ('=' == txt_c[0]) ) { + if ( 0 == i_w ) { + *wer = our; + } + else if ( 1 == i_w ) { + *wer = u3i_string("base"); + } + else { + req_u->peq_u->las_o = c3y; + } + txt_c++; + len_w--; + } + // slice cord + else { + c3_c* nex_c; + c3_c* tis_c = memchr(txt_c, '=', len_w); + c3_c* fas_c = memchr(txt_c, '/', len_w); + if ( tis_c && fas_c ) { + nex_c = c3_min(tis_c, fas_c); + } + else if ( tis_c ) { + nex_c = tis_c; + } + else { + nex_c = fas_c; + } + if ( !nex_c ) { + c3_c* msg_c 
= "bad beam"; + // h2o_send_error_generic(req_u->rec_u, 400, msg_c, msg_c, 0); + // return; + } + else { + c3_w dif_w = (c3_p)(nex_c - txt_c); + *wer = u3i_bytes(dif_w, (const c3_y*)txt_c); + txt_c = nex_c; + len_w = len_w - dif_w; + } + } + } + + bem.pur = u3dc("rush", u3i_bytes(len_w, (const c3_y*)txt_c), u3v_wish("stap")); + + return bem; +} + /* _http_req_dispatch(): dispatch http request */ static void @@ -860,86 +952,11 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) gang = u3_nul; } - u3_noun who; - u3_noun des; - u3_noun cas; - - // XX move to function - // get beak from path - // - for ( c3_w i_w = 0; i_w < 3; ++i_w ) { - u3_noun* wer; - if ( 0 == i_w ) { - wer = &who; - } - else if ( 1 == i_w ) { - wer = &des; - } - else { - wer = &cas; - } - - // find '//' - if ( (len_w >= 2) - && ('/' == bas_c[0]) - && ('/' == bas_c[1]) ) - { - *wer = u3_nul; - bas_c++; - len_w--; - } - // skip '/' - else if ( (len_w > 0) && ('/' == bas_c[0]) ) { - bas_c++; - len_w--; - } - // '=' - if ( (len_w > 0) && ('=' == bas_c[0]) ) { - if ( 0 == i_w ) { - *wer = our; - } - else if ( 1 == i_w ) { - *wer = u3i_string("base"); - } - else { - req_u->peq_u->las_o = c3y; - } - bas_c++; - len_w--; - } - // slice cord - else { - c3_c* nex_c; - c3_c* tis_c = memchr(bas_c, '=', len_w); - c3_c* fas_c = memchr(bas_c, '/', len_w); - if ( tis_c && fas_c ) { - nex_c = c3_min(tis_c, fas_c); - } - else if ( tis_c ) { - nex_c = tis_c; - } - else { - nex_c = fas_c; - } - - if ( !nex_c ) { - c3_c* msg_c = "bad beam"; - h2o_send_error_generic(req_u->rec_u, 400, msg_c, msg_c, 0); - return; - } - else { - c3_w dif_w = (c3_p)(nex_c - bas_c); - *wer = u3i_bytes(dif_w, (const c3_y*)bas_c); - bas_c = nex_c; - len_w = len_w - dif_w; - } - } - } - - u3_noun spur = u3dc("rush", u3i_bytes(len_w, (const c3_y*)bas_c), u3v_wish("stap")); + beam bem = _get_beam(req_u, bas_c, len_w); - if ( (u3_nul == spur) - || (c3n == u3r_sing(our, who)) ) + // XX necessary? 
+ if ( (u3_nul == bem.pur) + || (c3n == u3r_sing(our, bem.who)) ) { c3_c* msg_c = "bad scry path"; h2o_send_error_generic(req_u->rec_u, 400, msg_c, msg_c, 0); @@ -947,23 +964,23 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) } else { - spur = u3nc(u3i_string("mime"), u3t(spur)); + u3_noun spur = u3nc(u3i_string("mime"), u3t(bem.pur)); if ( c3y == req_u->peq_u->las_o ) { u3_pier_peek_last(htd_u->car_u.pir_u, gang, c3__ex, - des, spur, req_u->peq_u, _http_scry_cb); + bem.des, spur, req_u->peq_u, _http_scry_cb); } else { - u3_noun bem = u3nq(our, des, cas, spur); - u3_weak nac = u3h_get(htd_u->nax_p, bem); + u3_noun bam = u3nq(bem.who, bem.des, bem.cas, spur); + u3_weak nac = u3h_get(htd_u->nax_p, bam); if ( (u3_none == nac) || (u3_nul == nac) || ((u3_nul == gang) && (c3y == u3r_at(14, nac))) ) { // maybe cache, then serve subsequent range requests from cache - req_u->peq_u->pax = bem; - u3_pier_peek(htd_u->car_u.pir_u, gang, u3nt(0, c3__ex, u3k(bem)), + req_u->peq_u->pax = bam; + u3_pier_peek(htd_u->car_u.pir_u, gang, u3nt(0, c3__ex, u3k(bam)), req_u->peq_u, _http_scry_cb); } else { From b80e99a3906b2f85a25398424997744866cc8c08 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Tue, 23 Jul 2024 10:01:31 -0500 Subject: [PATCH 62/97] WIP jet hint call stack correct --- pkg/noun/jets/i/lagoon.c | 7 +++---- pkg/noun/jets/tree.c | 7 +------ 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/pkg/noun/jets/i/lagoon.c b/pkg/noun/jets/i/lagoon.c index 9ead70d14a..6bf53d4a1e 100644 --- a/pkg/noun/jets/i/lagoon.c +++ b/pkg/noun/jets/i/lagoon.c @@ -121,7 +121,6 @@ if (bloq < 4 || bloq > 7) { return u3_none; } - fprintf(stderr, ">>> u3qi_la_add\n\r"); // Unpack the data as a byte array. We assume total length < 2**64. 
// len_x is length in base units @@ -2134,7 +2133,6 @@ // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, y_meta, y_data; - fprintf(stderr, "> u3wi_la_add\n\r"); if ( c3n == u3r_mean(cor, u3x_sam_4, &x_meta, @@ -2159,6 +2157,7 @@ y_kind = u3h(u3t(u3t(y_meta))); // 14 y_fxp = u3t(u3t(u3t(y_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + fprintf(stderr, "> u3wi_la_add\r\n"); if ( c3n == u3ud(x_bloq) || c3n == u3ud(y_bloq) || c3n == u3ud(x_kind) || @@ -2171,15 +2170,15 @@ { return u3m_bail(c3__exit); } else { - fprintf(stderr, "> u3wi_la_add\n\r"); + fprintf(stderr, "x_bloq: %x\r\n", x_kind); switch (x_kind) { case c3__i754: _set_rounding(rnd); - fprintf(stderr, ">> u3wi_la_add\n\r"); u3_noun r_data = _soft_run(u3qi_la_add_i754(x_data, y_data, x_shape, x_bloq)); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: + fprintf(stderr, "default\r\n"); return u3_none; } } diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index 8ee3037fae..7b6224ba97 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2185,13 +2185,8 @@ static u3j_core _139_non__la_core_d[] = {} }; -static u3j_core _139_non__lagoon_d[] = - { { "la-core", 7, 0, _139_non__la_core_d, no_hashes }, - {} - }; - static u3j_core _139_non_d[] = - { { "lagoon", 6, 0, _139_non__lagoon_d, no_hashes }, + { { "lagoon", 7, 0, _139_non__la_core_d, no_hashes }, {} }; From 439fdf335a1ace4496796e702bd8eb169718e5c3 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Tue, 23 Jul 2024 10:57:41 -0500 Subject: [PATCH 63/97] WIP debugging u3_none path --- pkg/noun/jets/i/lagoon.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pkg/noun/jets/i/lagoon.c b/pkg/noun/jets/i/lagoon.c index 6bf53d4a1e..eb92dc447f 100644 --- a/pkg/noun/jets/i/lagoon.c +++ b/pkg/noun/jets/i/lagoon.c @@ -2157,7 +2157,6 @@ y_kind = u3h(u3t(u3t(y_meta))); // 14 y_fxp = u3t(u3t(u3t(y_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 - 
fprintf(stderr, "> u3wi_la_add\r\n"); if ( c3n == u3ud(x_bloq) || c3n == u3ud(y_bloq) || c3n == u3ud(x_kind) || @@ -2170,7 +2169,7 @@ { return u3m_bail(c3__exit); } else { - fprintf(stderr, "x_bloq: %x\r\n", x_kind); + fprintf(stderr, "\r\nx_kind: %x\r\n", x_kind); switch (x_kind) { case c3__i754: _set_rounding(rnd); @@ -2178,7 +2177,7 @@ return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: - fprintf(stderr, "default\r\n"); + fprintf(stderr, "uint\r\n"); return u3_none; } } @@ -3234,15 +3233,16 @@ u3r_bytes(0, 8, (c3_y*)&d_, d); n_ = f64_to_i64(f64_div(f64_sub((float64_t){b_}, (float64_t){a_}), (float64_t){d_}), softfloat_round_minMag, false); break; - case 7: - u3r_bytes(0, 16, (c3_y*)&a_, a); - u3r_bytes(0, 16, (c3_y*)&b_, b); - u3r_bytes(0, 16, (c3_y*)&d_, d); + case 7: { + c3_d a__[2], b__[2], d__[2]; + u3r_bytes(0, 16, (c3_y*)&a__, a); + u3r_bytes(0, 16, (c3_y*)&b__, b); + u3r_bytes(0, 16, (c3_y*)&d__, d); float128_t tmp; - f128M_sub((float128_t*){&b_}, (float128_t*){&a_}, &tmp); - f128M_div(&tmp, (float128_t*){&d_}, &tmp); + f128M_sub((float128_t*){&b__}, (float128_t*){&a__}, &tmp); + f128M_div(&tmp, (float128_t*){&d__}, &tmp); n_ = f128M_to_i64(&tmp, softfloat_round_minMag, false); - break; + break;} } u3_noun n = u3i_chub(n_+1); x_shape = u3nt(u3k(n), 0x1, u3_nul); From 470ebe9edd9ea24188ac711b91433bcd8b42d4bb Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Wed, 24 Jul 2024 09:34:56 -0400 Subject: [PATCH 64/97] http: add _get_range, slice in _http_scry_cb (WIP) --- pkg/vere/io/http.c | 253 ++++++++++++++++++++++++++------------------- 1 file changed, 145 insertions(+), 108 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index ba4c64d5f9..cdfc2a2094 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -7,7 +7,6 @@ #include "openssl/err.h" #include "openssl/ssl.h" #include "version.h" -#include typedef struct _u3_h2o_serv { h2o_globalconf_t fig_u; // h2o global config @@ -672,54 +671,57 @@ 
_content_headers(c3_z beg_z, c3_z end_z, c3_w tot_w) return out; } -/* _http_scry_cb() +typedef struct _range_request { + c3_z beg_z; + c3_z end_z; +} range_request; + +/* _parse_range: get a range from '-' delimited text */ -static void -_http_scry_cb(void* vod_p, u3_noun nun) +static range_request +_parse_range(c3_c* txt_c, c3_w len_w) { - // XX slice here - u3_preq* peq_u = vod_p; - u3_httd* htd_u = peq_u->htd_u; - u3_hreq* req_u = peq_u->req_u; + c3_c* hep_c = memchr(txt_c, '-', len_w); + range_request cut; + cut.beg_z = SIZE_MAX; + cut.end_z = SIZE_MAX; - if ( req_u ) { - u3_assert(u3_rsat_peek == req_u->sat_e); - req_u->peq_u = 0; - if ( u3_nul != nun ) { - u3_noun len = u3r_at(254, nun); - c3_w len_w; - if ( c3n == u3r_safe_word(len, &len_w) ) { - u3z(nun); - _http_scry_respond(req_u, u3_nul); - u3z(peq_u->pax); - c3_free(peq_u); - return; - } - u3_noun hez = _content_headers(0, (len_w - 1), len_w); - u3_noun res = u3i_edit(u3k(nun), 125, u3qb_weld(hez, u3r_at(125, nun))); - _http_scry_respond(req_u, u3k(res)); - } - else { - _http_scry_respond(req_u, u3k(nun)); + if ( hep_c ) { + cut.beg_z = h2o_strtosize(txt_c, hep_c - txt_c); + cut.end_z = h2o_strtosize(hep_c + 1, len_w - ((hep_c + 1) - txt_c)); + // strange -> [SIZE_MAX SIZE_MAX] so we return u3_nul in _slice_octs + if ( ((SIZE_MAX == cut.beg_z) && (hep_c != txt_c)) + || ((SIZE_MAX == cut.end_z) && (len_w - ((hep_c + 1) - txt_c) > 0)) ) + { + cut.beg_z = SIZE_MAX; + cut.end_z = SIZE_MAX; } } + return cut; +} - // cache only if peek was not at now, and nun isn't u3_nul - if ( (c3n == peq_u->las_o) - && (u3_nul != nun) ) +static c3_o +_get_range(h2o_headers_t req_headers, range_request* rng_req) +{ + rng_req->beg_z = SIZE_MAX; + rng_req->end_z = SIZE_MAX; + + c3_w idx = h2o_find_header(&req_headers, H2O_TOKEN_RANGE, -1); + if (idx == UINT32_MAX) { + return c3n; + } + + if ( (req_headers.entries[idx].value.len >= 6) + && (0 == memcmp("bytes=", req_headers.entries[idx].value.base, 6)) ) { - // XX pair of 
auth & path for key - // check ~watter-parter's - u3h_put(htd_u->nax_p, peq_u->pax, nun); + range_request tmp = _parse_range(req_headers.entries[idx].value.base + 6, + req_headers.entries[idx].value.len - 6); + rng_req->beg_z = tmp.beg_z; + rng_req->end_z = tmp.end_z; } - u3z(peq_u->pax); - c3_free(peq_u); -} -typedef struct _range_request { - c3_z beg_z; - c3_z end_z; -} range_request; + return c3y; +} typedef struct _content { c3_z beg_z; @@ -727,10 +729,10 @@ typedef struct _content { u3_noun dat; // XX free } content; -/* _slice_mime: given a valid range, slice a section of octs +/* _slice_octs: given a valid range, slice a section of octs */ static content -_slice_mime(range_request rng, u3_noun octs) +_slice_octs(range_request rng, u3_noun octs) { content out; out.beg_z = SIZE_MAX; @@ -776,28 +778,74 @@ _slice_mime(range_request rng, u3_noun octs) return out; } -/* _parse_range: get a range from '-' delimited text +/* _http_scry_cb() */ -static range_request -_parse_range(c3_c* txt_c, c3_w len_w) +static void +_http_scry_cb(void* vod_p, u3_noun nun) { - c3_c* hep_c = memchr(txt_c, '-', len_w); - range_request cut; - cut.beg_z = SIZE_MAX; - cut.end_z = SIZE_MAX; + // XX free + u3_preq* peq_u = vod_p; + u3_httd* htd_u = peq_u->htd_u; + u3_hreq* req_u = peq_u->req_u; - if ( hep_c ) { - cut.beg_z = h2o_strtosize(txt_c, hep_c - txt_c); - cut.end_z = h2o_strtosize(hep_c + 1, len_w - ((hep_c + 1) - txt_c)); - // strange -> [SIZE_MAX SIZE_MAX] so we return u3_nul in _slice_mime - if ( ((SIZE_MAX == cut.beg_z) && (hep_c != txt_c)) - || ((SIZE_MAX == cut.end_z) && (len_w - ((hep_c + 1) - txt_c) > 0)) ) - { - cut.beg_z = SIZE_MAX; - cut.end_z = SIZE_MAX; + if ( req_u ) { + u3_assert(u3_rsat_peek == req_u->sat_e); + req_u->peq_u = 0; + if ( u3_nul == nun ) { + _http_scry_respond(req_u, u3k(nun)); + } + else { + h2o_headers_t req_headers = req_u->rec_u->headers; + range_request rng_req; + c3_o rng_o = _get_range(req_headers, &rng_req); + + if (c3n == rng_o ) { + // XX 
review + u3_noun len = u3r_at(254, nun); + c3_w len_w; + if ( c3n == u3r_safe_word(len, &len_w) ) { + u3z(nun); + _http_scry_respond(req_u, u3_nul); + u3z(peq_u->pax); + c3_free(peq_u); + return; + } + u3_noun hez = _content_headers(0, (len_w - 1), len_w); + u3_noun res = u3i_edit(u3k(nun), 125, u3qb_weld(hez, u3r_at(125, nun))); + _http_scry_respond(req_u, u3k(res)); + } + else { + u3_noun octs = u3r_at(127, nun); + if ( u3_none == octs ) { + h2o_send_error_500(req_u->rec_u, "Internal Server Error", "scry failed", 0); + } + content result = _slice_octs(rng_req, octs); + if ( u3_nul == result.dat ) { + c3_c* msg_c = "Requested Range Not Satisfiable"; + u3z(result.dat); + h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); + } + else { + u3_noun hez = _content_headers(result.beg_z, result.end_z, u3h(octs)); + u3_noun res = u3i_edit(nun, 127, result.dat); + res = u3i_edit(res, 124, 206); + res = u3i_edit(res, 125, u3qb_weld(hez, u3r_at(125, res))); + _http_scry_respond(req_u, res); + } + } } } - return cut; + + // cache only if peek was not at now, and nun isn't u3_nul + if ( (c3n == peq_u->las_o) + && (u3_nul != nun) ) + { + // XX pair of auth & path for key + // check ~watter-parter's + u3h_put(htd_u->nax_p, peq_u->pax, nun); + } + u3z(peq_u->pax); + c3_free(peq_u); } typedef struct _beam { @@ -807,7 +855,7 @@ typedef struct _beam { u3_noun pur; } beam; -/* _get_beam: url to beam +/* _get_beam: path to beam */ static beam _get_beam(u3_hreq* req_u, c3_c* txt_c, c3_w len_w) @@ -967,7 +1015,7 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) u3_noun spur = u3nc(u3i_string("mime"), u3t(bem.pur)); if ( c3y == req_u->peq_u->las_o ) { u3_pier_peek_last(htd_u->car_u.pir_u, gang, c3__ex, - bem.des, spur, req_u->peq_u, _http_scry_cb); + bem.des, spur, req_u->peq_u, _http_scry_cb); } else { @@ -981,64 +1029,53 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) // maybe cache, then serve subsequent range requests from cache req_u->peq_u->pax = bam; 
u3_pier_peek(htd_u->car_u.pir_u, gang, u3nt(0, c3__ex, u3k(bam)), - req_u->peq_u, _http_scry_cb); + req_u->peq_u, _http_scry_cb); } else { - // XX function h2o_headers_t req_headers = req_u->rec_u->headers; - c3_w idx = h2o_find_header(&req_headers, H2O_TOKEN_RANGE, -1); + range_request rng_req; + c3_o rng_o = _get_range(req_headers, &rng_req); - if (idx == UINT32_MAX) { + if ( c3n == rng_o) { _http_cache_respond(req_u, nac); } else { - if ( (req_headers.entries[idx].value.len >= 6) && - (0 == memcmp("bytes=", req_headers.entries[idx].value.base, 6)) ) { - c3_w rest_len = req_headers.entries[idx].value.len - 6; - if ( 0 == rest_len ) { - c3_c* msg_c = "Requested Range Not Satisfiable"; - h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); - // XX leaks - return; - } - range_request rng_req = _parse_range(req_headers.entries[idx].value.base + 6, rest_len); - u3_noun octs = u3r_at(127, nac); - if ( u3_none == octs ) { - h2o_send_error_500(rec_u, "Internal Server Error", "scry failed", 0); - return; - } - content result = _slice_mime(rng_req, octs); + u3_noun octs = u3r_at(127, nac); + if ( u3_none == octs ) { + h2o_send_error_500(rec_u, "Internal Server Error", "scry failed", 0); + return; + } + content result = _slice_octs(rng_req, octs); - if ( u3_nul == result.dat ) { - c3_c* msg_c = "Requested Range Not Satisfiable"; - u3z(result.dat); - h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); - return; - } + if ( u3_nul == result.dat ) { + c3_c* msg_c = "Requested Range Not Satisfiable"; + u3z(result.dat); + h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); + return; + } - if ( c3y == u3r_sing(result.dat, octs) ) { - // 200 - u3z(result.dat); - u3_noun len = u3r_at(254, nac); - c3_w len_w; - if ( c3n == u3r_safe_word(len, &len_w) ) { - u3z(nac); - _http_scry_respond(req_u, u3_nul); - return; - } - u3_noun hez = _content_headers(0, (len_w - 1), len_w); - u3_noun res = u3i_edit(nac, 125, u3qb_weld(hez, u3r_at(125, nac))); - 
_http_cache_respond(req_u, res); - } - else { - // 206 - u3_noun hez = _content_headers(result.beg_z, result.end_z, u3h(octs)); - u3_noun res = u3i_edit(nac, 127, result.dat); - res = u3i_edit(res, 124, 206); - res = u3i_edit(res, 125, u3qb_weld(hez, u3r_at(125, res))); - _http_cache_respond(req_u, res); + if ( c3y == u3r_sing(result.dat, octs) ) { + // 200 + u3z(result.dat); + u3_noun len = u3r_at(254, nac); + c3_w len_w; + if ( c3n == u3r_safe_word(len, &len_w) ) { + u3z(nac); + _http_scry_respond(req_u, u3_nul); + return; } + u3_noun hez = _content_headers(0, (len_w - 1), len_w); + u3_noun res = u3i_edit(nac, 125, u3qb_weld(hez, u3r_at(125, nac))); + _http_cache_respond(req_u, res); + } + else { + // 206 + u3_noun hez = _content_headers(result.beg_z, result.end_z, u3h(octs)); + u3_noun res = u3i_edit(nac, 127, result.dat); + res = u3i_edit(res, 124, 206); + res = u3i_edit(res, 125, u3qb_weld(hez, u3r_at(125, res))); + _http_cache_respond(req_u, res); } } } From 20f30fc10ecef907375b1ca5f78313a5b9141a6f Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Wed, 24 Jul 2024 13:53:57 -0400 Subject: [PATCH 65/97] http: add _http_range_respond --- pkg/vere/io/http.c | 114 +++++++++++++++++++++------------------------ 1 file changed, 52 insertions(+), 62 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index cdfc2a2094..525ad353c4 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -778,12 +778,52 @@ _slice_octs(range_request rng, u3_noun octs) return out; } +static void +_http_range_respond(u3_hreq* req_u, u3_noun nac, range_request rng_req) +{ + u3_noun octs = u3r_at(127, nac); + if ( u3_none == octs ) { + h2o_send_error_500(req_u->rec_u, "Internal Server Error", "scry failed", 0); + return; + } + content result = _slice_octs(rng_req, octs); + + if ( u3_nul == result.dat ) { + c3_c* msg_c = "Requested Range Not Satisfiable"; + u3z(result.dat); + h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); + return; + } + + if ( c3y == 
u3r_sing(result.dat, octs) ) { + // 200 + u3z(result.dat); + u3_noun len = u3r_at(254, nac); + c3_w len_w; + if ( c3n == u3r_safe_word(len, &len_w) ) { + u3z(nac); + _http_scry_respond(req_u, u3_nul); + return; + } + u3_noun hez = _content_headers(0, (len_w - 1), len_w); + u3_noun res = u3i_edit(nac, 125, u3qb_weld(hez, u3r_at(125, nac))); + _http_cache_respond(req_u, res); + } + else { + // 206 + u3_noun hez = _content_headers(result.beg_z, result.end_z, u3h(octs)); + u3_noun res = u3i_edit(nac, 127, result.dat); + res = u3i_edit(res, 124, 206); + res = u3i_edit(res, 125, u3qb_weld(hez, u3r_at(125, res))); + _http_cache_respond(req_u, res); + } +} + /* _http_scry_cb() */ static void _http_scry_cb(void* vod_p, u3_noun nun) { - // XX free u3_preq* peq_u = vod_p; u3_httd* htd_u = peq_u->htd_u; u3_hreq* req_u = peq_u->req_u; @@ -799,7 +839,10 @@ _http_scry_cb(void* vod_p, u3_noun nun) range_request rng_req; c3_o rng_o = _get_range(req_headers, &rng_req); - if (c3n == rng_o ) { + if (c3y == rng_o ) { + _http_range_respond(req_u, nun, rng_req); + } + else { // XX review u3_noun len = u3r_at(254, nun); c3_w len_w; @@ -814,25 +857,6 @@ _http_scry_cb(void* vod_p, u3_noun nun) u3_noun res = u3i_edit(u3k(nun), 125, u3qb_weld(hez, u3r_at(125, nun))); _http_scry_respond(req_u, u3k(res)); } - else { - u3_noun octs = u3r_at(127, nun); - if ( u3_none == octs ) { - h2o_send_error_500(req_u->rec_u, "Internal Server Error", "scry failed", 0); - } - content result = _slice_octs(rng_req, octs); - if ( u3_nul == result.dat ) { - c3_c* msg_c = "Requested Range Not Satisfiable"; - u3z(result.dat); - h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); - } - else { - u3_noun hez = _content_headers(result.beg_z, result.end_z, u3h(octs)); - u3_noun res = u3i_edit(nun, 127, result.dat); - res = u3i_edit(res, 124, 206); - res = u3i_edit(res, 125, u3qb_weld(hez, u3r_at(125, res))); - _http_scry_respond(req_u, res); - } - } } } @@ -1032,51 +1056,15 @@ _http_req_dispatch(u3_hreq* req_u, 
u3_noun req) req_u->peq_u, _http_scry_cb); } else { - // XX function h2o_headers_t req_headers = req_u->rec_u->headers; range_request rng_req; c3_o rng_o = _get_range(req_headers, &rng_req); - if ( c3n == rng_o) { - _http_cache_respond(req_u, nac); + if ( c3y == rng_o) { + _http_range_respond(req_u, nac, rng_req); } else { - u3_noun octs = u3r_at(127, nac); - if ( u3_none == octs ) { - h2o_send_error_500(rec_u, "Internal Server Error", "scry failed", 0); - return; - } - content result = _slice_octs(rng_req, octs); - - if ( u3_nul == result.dat ) { - c3_c* msg_c = "Requested Range Not Satisfiable"; - u3z(result.dat); - h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); - return; - } - - if ( c3y == u3r_sing(result.dat, octs) ) { - // 200 - u3z(result.dat); - u3_noun len = u3r_at(254, nac); - c3_w len_w; - if ( c3n == u3r_safe_word(len, &len_w) ) { - u3z(nac); - _http_scry_respond(req_u, u3_nul); - return; - } - u3_noun hez = _content_headers(0, (len_w - 1), len_w); - u3_noun res = u3i_edit(nac, 125, u3qb_weld(hez, u3r_at(125, nac))); - _http_cache_respond(req_u, res); - } - else { - // 206 - u3_noun hez = _content_headers(result.beg_z, result.end_z, u3h(octs)); - u3_noun res = u3i_edit(nac, 127, result.dat); - res = u3i_edit(res, 124, 206); - res = u3i_edit(res, 125, u3qb_weld(hez, u3r_at(125, res))); - _http_cache_respond(req_u, res); - } + _http_cache_respond(req_u, nac); } } } @@ -1086,7 +1074,8 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) } static void -_http_scry_respond(u3_hreq* req_u, u3_noun nun) { +_http_scry_respond(u3_hreq* req_u, u3_noun nun) +{ h2o_req_t* rec_u = req_u->rec_u; u3_httd* htd_u = req_u->hon_u->htp_u->htd_u; @@ -1130,7 +1119,8 @@ _http_scry_respond(u3_hreq* req_u, u3_noun nun) { /* _http_cache_respond(): respond with a simple-payload:http */ static void -_http_cache_respond(u3_hreq* req_u, u3_noun nun) { +_http_cache_respond(u3_hreq* req_u, u3_noun nun) +{ h2o_req_t* rec_u = req_u->rec_u; u3_httd* htd_u = 
req_u->hon_u->htp_u->htd_u; From 1a2d6597a277ef9eb6e152bec13981fdb289fba8 Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Thu, 25 Jul 2024 12:47:13 -0400 Subject: [PATCH 66/97] http: put auth in cache key, cleanup --- pkg/vere/io/http.c | 44 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index 525ad353c4..af0dbf2ee9 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -783,12 +783,15 @@ _http_range_respond(u3_hreq* req_u, u3_noun nac, range_request rng_req) { u3_noun octs = u3r_at(127, nac); if ( u3_none == octs ) { - h2o_send_error_500(req_u->rec_u, "Internal Server Error", "scry failed", 0); + // 400 + c3_c* msg_c = "bad request"; + h2o_send_error_generic(req_u->rec_u, 400, msg_c, msg_c, 0); return; } content result = _slice_octs(rng_req, octs); if ( u3_nul == result.dat ) { + // 416 c3_c* msg_c = "Requested Range Not Satisfiable"; u3z(result.dat); h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); @@ -796,7 +799,7 @@ _http_range_respond(u3_hreq* req_u, u3_noun nac, range_request rng_req) } if ( c3y == u3r_sing(result.dat, octs) ) { - // 200 + // 200 u3z(result.dat); u3_noun len = u3r_at(254, nac); c3_w len_w; @@ -810,7 +813,7 @@ _http_range_respond(u3_hreq* req_u, u3_noun nac, range_request rng_req) _http_cache_respond(req_u, res); } else { - // 206 + // 206 u3_noun hez = _content_headers(result.beg_z, result.end_z, u3h(octs)); u3_noun res = u3i_edit(nac, 127, result.dat); res = u3i_edit(res, 124, 206); @@ -827,6 +830,8 @@ _http_scry_cb(void* vod_p, u3_noun nun) u3_preq* peq_u = vod_p; u3_httd* htd_u = peq_u->htd_u; u3_hreq* req_u = peq_u->req_u; + u3_hfig* fig_u = &req_u->hon_u->htp_u->htd_u->fig_u; + c3_o auth = _http_req_is_auth(fig_u, req_u->rec_u); if ( req_u ) { u3_assert(u3_rsat_peek == req_u->sat_e); @@ -843,30 +848,18 @@ _http_scry_cb(void* vod_p, u3_noun nun) _http_range_respond(req_u, nun, rng_req); } else { - // XX review - u3_noun 
len = u3r_at(254, nun); - c3_w len_w; - if ( c3n == u3r_safe_word(len, &len_w) ) { - u3z(nun); - _http_scry_respond(req_u, u3_nul); - u3z(peq_u->pax); - c3_free(peq_u); - return; - } - u3_noun hez = _content_headers(0, (len_w - 1), len_w); - u3_noun res = u3i_edit(u3k(nun), 125, u3qb_weld(hez, u3r_at(125, nun))); - _http_scry_respond(req_u, u3k(res)); + _http_cache_respond(req_u, u3k(nun)); } } } // cache only if peek was not at now, and nun isn't u3_nul - if ( (c3n == peq_u->las_o) - && (u3_nul != nun) ) - { + if ( c3n == peq_u->las_o ) { // XX pair of auth & path for key // check ~watter-parter's - u3h_put(htd_u->nax_p, peq_u->pax, nun); + u3_noun key = u3nc(auth, u3k(peq_u->pax)); + u3h_put(htd_u->nax_p, key, nun); + u3z(key); } u3z(peq_u->pax); c3_free(peq_u); @@ -1044,15 +1037,17 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) else { u3_noun bam = u3nq(bem.who, bem.des, bem.cas, spur); - u3_weak nac = u3h_get(htd_u->nax_p, bam); + u3_noun key = u3nc(auth, u3k(bam)); + u3_weak nac = u3h_get(htd_u->nax_p, key); + u3z(key); if ( (u3_none == nac) || (u3_nul == nac) || ((u3_nul == gang) && (c3y == u3r_at(14, nac))) ) { - // maybe cache, then serve subsequent range requests from cache - req_u->peq_u->pax = bam; - u3_pier_peek(htd_u->car_u.pir_u, gang, u3nt(0, c3__ex, u3k(bam)), + // cache, then serve subsequent range requests from cache + req_u->peq_u->pax = u3k(bam); + u3_pier_peek(htd_u->car_u.pir_u, gang, u3nt(0, c3__ex, bam), req_u->peq_u, _http_scry_cb); } else { @@ -1177,7 +1172,6 @@ _http_cache_scry_cb(void* vod_p, u3_noun nun) _http_cache_respond(req_u, u3k(nun)); } - // XX pair of auth & path for key u3h_put(htd_u->nax_p, peq_u->pax, nun); u3z(peq_u->pax); c3_free(peq_u); From ab03d62d4d431f06608d6a17d65824458aeec613 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Fri, 26 Jul 2024 12:53:23 -0500 Subject: [PATCH 67/97] Turn off Nock verification. 
--- pkg/noun/jets/i/lagoon.c | 2 -- pkg/noun/jets/tree.c | 58 ++++++++++++++++++++-------------------- 2 files changed, 29 insertions(+), 31 deletions(-) diff --git a/pkg/noun/jets/i/lagoon.c b/pkg/noun/jets/i/lagoon.c index eb92dc447f..4466018f48 100644 --- a/pkg/noun/jets/i/lagoon.c +++ b/pkg/noun/jets/i/lagoon.c @@ -2169,7 +2169,6 @@ { return u3m_bail(c3__exit); } else { - fprintf(stderr, "\r\nx_kind: %x\r\n", x_kind); switch (x_kind) { case c3__i754: _set_rounding(rnd); @@ -2177,7 +2176,6 @@ return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: - fprintf(stderr, "uint\r\n"); return u3_none; } } diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index 7b6224ba97..19335435f7 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2125,34 +2125,34 @@ static u3j_core _139_hex_json_d[] = /* /lib jets in non core */ -static u3j_harm _139_non__lagoon_add_a[] = {{".2", u3wi_la_add, c3n}, {}}; -static u3j_harm _139_non__lagoon_sub_a[] = {{".2", u3wi_la_sub, c3n}, {}}; -static u3j_harm _139_non__lagoon_mul_a[] = {{".2", u3wi_la_mul, c3n}, {}}; -static u3j_harm _139_non__lagoon_div_a[] = {{".2", u3wi_la_div, c3n}, {}}; -static u3j_harm _139_non__lagoon_mod_a[] = {{".2", u3wi_la_mod, c3n}, {}}; -static u3j_harm _139_non__lagoon_adds_a[] = {{".2", u3wi_la_adds, c3n}, {}}; -static u3j_harm _139_non__lagoon_subs_a[] = {{".2", u3wi_la_subs, c3n}, {}}; -static u3j_harm _139_non__lagoon_muls_a[] = {{".2", u3wi_la_muls, c3n}, {}}; -static u3j_harm _139_non__lagoon_divs_a[] = {{".2", u3wi_la_divs, c3n}, {}}; -static u3j_harm _139_non__lagoon_mods_a[] = {{".2", u3wi_la_mods, c3n}, {}}; -static u3j_harm _139_non__lagoon_dot_a[] = {{".2", u3wi_la_dot, c3n}, {}}; -static u3j_harm _139_non__lagoon_trans_a[] ={{".2", u3wi_la_transpose, c3n}, {}}; -static u3j_harm _139_non__lagoon_cumsum_a[]={{".2", u3wi_la_cumsum, c3n}, {}}; -static u3j_harm _139_non__lagoon_argmin_a[]={{".2", u3wi_la_argmin, c3n}, {}}; -static u3j_harm 
_139_non__lagoon_argmax_a[]={{".2", u3wi_la_argmax, c3n}, {}}; -static u3j_harm _139_non__lagoon_ravel_a[]={{".2", u3wi_la_ravel, c3n}, {}}; -static u3j_harm _139_non__lagoon_min_a[] = {{".2", u3wi_la_min, c3n}, {}}; -static u3j_harm _139_non__lagoon_max_a[] = {{".2", u3wi_la_max, c3n}, {}}; -static u3j_harm _139_non__lagoon_linspace_a[]={{".2", u3wi_la_linspace, c3n}, {}}; -static u3j_harm _139_non__lagoon_range_a[]= {{".2", u3wi_la_range, c3n}, {}}; -static u3j_harm _139_non__lagoon_abs_a[] = {{".2", u3wi_la_abs, c3n}, {}}; -static u3j_harm _139_non__lagoon_gth_a[] = {{".2", u3wi_la_gth, c3n}, {}}; -static u3j_harm _139_non__lagoon_gte_a[] = {{".2", u3wi_la_gte, c3n}, {}}; -static u3j_harm _139_non__lagoon_lth_a[] = {{".2", u3wi_la_lth, c3n}, {}}; -static u3j_harm _139_non__lagoon_lte_a[] = {{".2", u3wi_la_lte, c3n}, {}}; -static u3j_harm _139_non__lagoon_diag_a[] = {{".2", u3wi_la_diag, c3n}, {}}; -static u3j_harm _139_non__lagoon_trace_a[]= {{".2", u3wi_la_trace, c3n}, {}}; -static u3j_harm _139_non__lagoon_mmul_a[] = {{".2", u3wi_la_mmul, c3n}, {}}; +static u3j_harm _139_non__lagoon_add_a[] = {{".2", u3wi_la_add}, {}}; +static u3j_harm _139_non__lagoon_sub_a[] = {{".2", u3wi_la_sub}, {}}; +static u3j_harm _139_non__lagoon_mul_a[] = {{".2", u3wi_la_mul}, {}}; +static u3j_harm _139_non__lagoon_div_a[] = {{".2", u3wi_la_div}, {}}; +static u3j_harm _139_non__lagoon_mod_a[] = {{".2", u3wi_la_mod}, {}}; +static u3j_harm _139_non__lagoon_adds_a[] = {{".2", u3wi_la_adds}, {}}; +static u3j_harm _139_non__lagoon_subs_a[] = {{".2", u3wi_la_subs}, {}}; +static u3j_harm _139_non__lagoon_muls_a[] = {{".2", u3wi_la_muls}, {}}; +static u3j_harm _139_non__lagoon_divs_a[] = {{".2", u3wi_la_divs}, {}}; +static u3j_harm _139_non__lagoon_mods_a[] = {{".2", u3wi_la_mods}, {}}; +static u3j_harm _139_non__lagoon_dot_a[] = {{".2", u3wi_la_dot}, {}}; +static u3j_harm _139_non__lagoon_trans_a[] ={{".2", u3wi_la_transpose}, {}}; +static u3j_harm _139_non__lagoon_cumsum_a[]={{".2", 
u3wi_la_cumsum}, {}}; +static u3j_harm _139_non__lagoon_argmin_a[]={{".2", u3wi_la_argmin}, {}}; +static u3j_harm _139_non__lagoon_argmax_a[]={{".2", u3wi_la_argmax}, {}}; +static u3j_harm _139_non__lagoon_ravel_a[]={{".2", u3wi_la_ravel}, {}}; +static u3j_harm _139_non__lagoon_min_a[] = {{".2", u3wi_la_min}, {}}; +static u3j_harm _139_non__lagoon_max_a[] = {{".2", u3wi_la_max}, {}}; +static u3j_harm _139_non__lagoon_linspace_a[]={{".2", u3wi_la_linspace}, {}}; +static u3j_harm _139_non__lagoon_range_a[]= {{".2", u3wi_la_range}, {}}; +static u3j_harm _139_non__lagoon_abs_a[] = {{".2", u3wi_la_abs}, {}}; +static u3j_harm _139_non__lagoon_gth_a[] = {{".2", u3wi_la_gth}, {}}; +static u3j_harm _139_non__lagoon_gte_a[] = {{".2", u3wi_la_gte}, {}}; +static u3j_harm _139_non__lagoon_lth_a[] = {{".2", u3wi_la_lth}, {}}; +static u3j_harm _139_non__lagoon_lte_a[] = {{".2", u3wi_la_lte}, {}}; +static u3j_harm _139_non__lagoon_diag_a[] = {{".2", u3wi_la_diag}, {}}; +static u3j_harm _139_non__lagoon_trace_a[]= {{".2", u3wi_la_trace}, {}}; +static u3j_harm _139_non__lagoon_mmul_a[] = {{".2", u3wi_la_mmul}, {}}; static u3j_core _139_non__la_core_d[] = { { "add-rays", 7, _139_non__lagoon_add_a, 0, no_hashes }, { "sub-rays", 7, _139_non__lagoon_sub_a, 0, no_hashes }, @@ -2169,7 +2169,7 @@ static u3j_core _139_non__la_core_d[] = { "cumsum", 7, _139_non__lagoon_cumsum_a, 0, no_hashes }, { "argmin", 7, _139_non__lagoon_argmin_a, 0, no_hashes }, { "argmax", 7, _139_non__lagoon_argmax_a, 0, no_hashes }, - { "ravel", 7, _139_non__lagoon_ravel_a, 0, no_hashes }, + // { "ravel", 7, _139_non__lagoon_ravel_a, 0, no_hashes }, { "min", 7, _139_non__lagoon_min_a, 0, no_hashes }, { "max", 7, _139_non__lagoon_max_a, 0, no_hashes }, { "linspace", 7, _139_non__lagoon_linspace_a, 0, no_hashes }, From a3e7b428eb2d0add6a03d3ed213582bc84e3ccd9 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Fri, 26 Jul 2024 19:44:23 -0500 Subject: [PATCH 68/97] Post working jets. 
--- pkg/noun/jets/i/lagoon.c | 138 +++++++++++++++++++++++++++------------ pkg/noun/jets/tree.c | 2 +- 2 files changed, 97 insertions(+), 43 deletions(-) diff --git a/pkg/noun/jets/i/lagoon.c b/pkg/noun/jets/i/lagoon.c index 4466018f48..01e8e9954a 100644 --- a/pkg/noun/jets/i/lagoon.c +++ b/pkg/noun/jets/i/lagoon.c @@ -108,6 +108,26 @@ return dims; } +/* check consistency of array shape and bloq size + |= =ray + ^- ? + .= (roll shape.meta.ray ^mul) + (dec (met bloq.meta.ray data.ray)) +*/ + static inline c3_o _check(u3_noun ray) + { + // Calculate expected size. + u3_atom shp = u3h(u3h(ray)); // (reported) shape of ray, +4 + u3_atom blq = u3h(u3t(u3h(ray))); // block size of ray, +10 + u3_atom sin = _get_length(shp); // calculated length of ray + + // Calculate actual size. + u3_atom len = u3r_met(blq, u3t(ray)); // length of ray + u3_atom dex = u3qa_dec(len); // decrement length b/c of pinned 1 + + return __(sin == dex); + } + /* add - axpy = 1*x+y */ u3_noun @@ -1859,14 +1879,14 @@ u3r_bytes(0, 2, (c3_y*)&(b16.v), b); float16_t span16 = f16_sub(b16, a16); float16_t interval16 = f16_div(span16, i32_to_f16(n-1)); - c3_y* x_bytes16 = (c3_y*)u3a_malloc(((n+1)*2+1)*sizeof(c3_y)); - for (c3_d i = 1; i <= n; i++) { - ((float16_t*)x_bytes16)[n-i] = f16_add(a16, f16_mul(i32_to_f16(i), interval16)); - } - ((float16_t*)x_bytes16)[n] = a16; - ((float16_t*)x_bytes16)[0] = b16; - x_bytes16[(n+1)*2] = 0x1; // pin head - r_data = u3i_bytes(((n+1)*2+1)*sizeof(c3_y), x_bytes16); + c3_y* x_bytes16 = (c3_y*)u3a_malloc((n*2+1)*sizeof(c3_y)); + for (c3_d i = 1; i < n-1; i++) { + ((float16_t*)x_bytes16)[i] = f16_add(a16, f16_mul(i32_to_f16(i), interval16)); + } + ((float16_t*)x_bytes16)[0] = a16; + ((float16_t*)x_bytes16)[n-1] = b16; + x_bytes16[n*2] = 0x1; // pin head + r_data = u3i_bytes((n*2+1)*sizeof(c3_y), x_bytes16); u3a_free(x_bytes16); break;} @@ -1876,14 +1896,14 @@ u3r_bytes(0, 4, (c3_y*)&(b32.v), b); float32_t span32 = f32_sub(b32, a32); float32_t interval32 = 
f32_div(span32, i32_to_f32(n-1)); - c3_y* x_bytes32 = (c3_y*)u3a_malloc(((n+1)*4+1)*sizeof(c3_y)); - for (c3_d i = 1; i <= n; i++) { - ((float32_t*)x_bytes32)[n-i] = f32_add(a32, f32_mul(i32_to_f32(i), interval32)); - } - ((float32_t*)x_bytes32)[n] = a32; - ((float32_t*)x_bytes32)[0] = b32; - x_bytes32[(n+1)*4] = 0x1; // pin head - r_data = u3i_bytes(((n+1)*4+1)*sizeof(c3_y), x_bytes32); + c3_y* x_bytes32 = (c3_y*)u3a_malloc((n*4+1)*sizeof(c3_y)); + for (c3_d i = 1; i < n-1; i++) { + ((float32_t*)x_bytes32)[i] = f32_add(a32, f32_mul(i32_to_f32(i), interval32)); + } + ((float32_t*)x_bytes32)[0] = a32; + ((float32_t*)x_bytes32)[n-1] = b32; + x_bytes32[n*4] = 0x1; // pin head + r_data = u3i_bytes((n*4+1)*sizeof(c3_y), x_bytes32); u3a_free(x_bytes32); break;} @@ -1893,14 +1913,14 @@ u3r_bytes(0, 8, (c3_y*)&(b64.v), b); float64_t span64 = f64_sub(b64, a64); float64_t interval64 = f64_div(span64, i32_to_f64(n-1)); - c3_y* x_bytes64 = (c3_y*)u3a_malloc(((n+1)*8+1)*sizeof(c3_y)); - for (c3_d i = 1; i < n; i++) { - ((float64_t*)x_bytes64)[n-i] = f64_add(a64, f64_mul(i32_to_f64(i), interval64)); - } - ((float64_t*)x_bytes64)[n] = a64; - ((float64_t*)x_bytes64)[0] = b64; - x_bytes64[(n+1)*8] = 0x1; // pin head - r_data = u3i_bytes(((n+1)*8+1)*sizeof(c3_y), x_bytes64); + c3_y* x_bytes64 = (c3_y*)u3a_malloc((n*8+1)*sizeof(c3_y)); + for (c3_d i = 1; i < n-1; i++) { + ((float64_t*)x_bytes64)[i] = f64_add(a64, f64_mul(i32_to_f64(i), interval64)); + } + ((float64_t*)x_bytes64)[0] = a64; + ((float64_t*)x_bytes64)[n-1] = b64; + x_bytes64[n*8] = 0x1; // pin head + r_data = u3i_bytes((n*8+1)*sizeof(c3_y), x_bytes64); u3a_free(x_bytes64); break;} @@ -1914,17 +1934,17 @@ float128_t n128; i32_to_f128M(n-1, &n128); f128M_div(&span128, &n128, &interval128); - c3_y* x_bytes128 = (c3_y*)u3a_malloc(((n+1)*16+1)*sizeof(c3_y)); + c3_y* x_bytes128 = (c3_y*)u3a_malloc((n*16+1)*sizeof(c3_y)); float128_t i128; - for (c3_d i = 1; i < n; i++) { + for (c3_d i = 1; i < n-1; i++) { i32_to_f128M(i, 
&i128); - f128M_mul(&i128, &interval128, &((float128_t*)x_bytes128)[n-i]); - f128M_add(&a128, &((float128_t*)x_bytes128)[n-i], &((float128_t*)x_bytes128)[n-i]); + f128M_mul(&i128, &interval128, &((float128_t*)x_bytes128)[i]); + f128M_add(&a128, &((float128_t*)x_bytes128)[i], &((float128_t*)x_bytes128)[i]); } - ((float128_t*)x_bytes128)[n] = a128; - ((float128_t*)x_bytes128)[0] = b128; - x_bytes128[(n+1)*16] = 0x1; // pin head - r_data = u3i_bytes(((n+1)*16+1)*sizeof(c3_y), x_bytes128); + ((float128_t*)x_bytes128)[0] = a128; + ((float128_t*)x_bytes128)[n-1] = b128; + x_bytes128[n*16] = 0x1; // pin head + r_data = u3i_bytes((n*16+1)*sizeof(c3_y), x_bytes128); u3a_free(x_bytes128); break;} } @@ -2173,6 +2193,7 @@ case c3__i754: _set_rounding(rnd); u3_noun r_data = _soft_run(u3qi_la_add_i754(x_data, y_data, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2228,6 +2249,7 @@ case c3__i754: _set_rounding(rnd); u3_noun r_data = _soft_run(u3qi_la_sub_i754(x_data, y_data, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2283,6 +2305,7 @@ case c3__i754: _set_rounding(rnd); u3_noun r_data = _soft_run(u3qi_la_mul_i754(x_data, y_data, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2338,6 +2361,7 @@ case c3__i754: _set_rounding(rnd); u3_noun r_data = _soft_run(u3qi_la_div_i754(x_data, y_data, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2393,6 +2417,7 @@ case c3__i754: _set_rounding(rnd); u3_noun r_data = _soft_run(u3qi_la_mod_i754(x_data, y_data, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), 
u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2424,7 +2449,8 @@ x_fxp = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 if ( c3n == u3ud(x_bloq) || - c3n == u3ud(x_kind) + c3n == u3ud(x_kind) || + c3n == _check(u3nc(x_meta, x_data)) ) { return u3m_bail(c3__exit); @@ -2433,6 +2459,7 @@ case c3__i754: _set_rounding(rnd); u3_noun r_data = _soft_run(u3qi_la_cumsum_i754(x_data, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2462,7 +2489,8 @@ x_kind = u3h(u3t(u3t(x_meta))); // 14 x_fxp = u3t(u3t(u3t(x_meta))); // 15 if ( c3n == u3ud(x_bloq) || - c3n == u3ud(x_kind) + c3n == u3ud(x_kind) || + c3n == _check(u3nc(x_meta, x_data)) ) { return u3m_bail(c3__exit); @@ -2499,7 +2527,8 @@ x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 if ( c3n == u3ud(x_bloq) || - c3n == u3ud(x_kind) + c3n == u3ud(x_kind) || + c3n == _check(u3nc(x_meta, x_data)) ) { return u3m_bail(c3__exit); @@ -2537,7 +2566,8 @@ x_kind = u3h(u3t(u3t(x_meta))); // 14 x_fxp = u3t(u3t(u3t(x_meta))); // 15 if ( c3n == u3ud(x_bloq) || - c3n == u3ud(x_kind) + c3n == u3ud(x_kind) || + c3n == _check(u3nc(x_meta, x_data)) ) { return u3m_bail(c3__exit); @@ -2575,7 +2605,8 @@ x_kind = u3h(u3t(u3t(x_meta))); // 14 x_fxp = u3t(u3t(u3t(x_meta))); // 15 if ( c3n == u3ud(x_bloq) || - c3n == u3ud(x_kind) + c3n == u3ud(x_kind) || + c3n == _check(u3nc(x_meta, x_data)) ) { return u3m_bail(c3__exit); @@ -2583,6 +2614,7 @@ switch (x_kind) { case c3__i754: { u3_noun r_data = _soft_run(u3qi_la_min_i754(x_data, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} default: @@ -2612,7 +2644,8 @@ x_kind = u3h(u3t(u3t(x_meta))); // 14 x_fxp = u3t(u3t(u3t(x_meta))); // 15 if ( c3n == u3ud(x_bloq) || - c3n == u3ud(x_kind) + c3n == u3ud(x_kind) || + c3n == _check(u3nc(x_meta, x_data)) 
) { return u3m_bail(c3__exit); @@ -2620,6 +2653,7 @@ switch (x_kind) { case c3__i754: { u3_noun r_data = _soft_run(u3qi_la_max_i754(x_data, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} default: @@ -2657,6 +2691,7 @@ switch (x_kind) { case c3__i754: { u3_noun r_data = _soft_run(u3qi_la_abs_i754(x_data, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} default: @@ -2710,6 +2745,7 @@ switch (x_kind) { case c3__i754: { u3_noun r_data = _soft_run(u3qi_la_gth_i754(x_data, y_data, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} default: @@ -2763,6 +2799,7 @@ switch (x_kind) { case c3__i754: { u3_noun r_data = _soft_run(u3qi_la_gte_i754(x_data, y_data, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} default: @@ -2816,6 +2853,7 @@ switch (x_kind) { case c3__i754: { u3_noun r_data = _soft_run(u3qi_la_lth_i754(x_data, y_data, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} default: @@ -2869,6 +2907,7 @@ switch (x_kind) { case c3__i754: { u3_noun r_data = _soft_run(u3qi_la_lte_i754(x_data, y_data, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} default: @@ -2905,6 +2944,7 @@ case c3__i754: _set_rounding(rnd); u3_noun r_data = _soft_run(u3qi_la_adds_i754(x_data, n, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2940,6 +2980,7 @@ case c3__i754: _set_rounding(rnd); u3_noun r_data = 
_soft_run(u3qi_la_subs_i754(x_data, n, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -2975,6 +3016,7 @@ case c3__i754: _set_rounding(rnd); u3_noun r_data = _soft_run(u3qi_la_muls_i754(x_data, n, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -3010,6 +3052,7 @@ case c3__i754: _set_rounding(rnd); u3_noun r_data = _soft_run(u3qi_la_divs_i754(x_data, n, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -3045,6 +3088,7 @@ case c3__i754: _set_rounding(rnd); u3_noun r_data = _soft_run(u3qi_la_mods_i754(x_data, n, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -3099,6 +3143,7 @@ case c3__i754: _set_rounding(rnd); u3_noun r_data = _soft_run(u3qi_la_dot_i754(x_data, y_data, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } c3_d len_x0 = _get_dims(x_shape)[0]; return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); @@ -3129,12 +3174,14 @@ x_kind = u3h(u3t(u3t(x_meta))); // 14 x_fxp = u3t(u3t(u3t(x_meta))); // 15 if ( c3n == u3ud(x_bloq) || - c3n == u3ud(x_kind) + c3n == u3ud(x_kind) || + c3n == _check(cor) ) { return u3m_bail(c3__exit); } else { u3_noun r_data = _soft_run(u3qi_la_transpose(x_data, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3nt(u3k(u3h(x_shape)), u3k(u3h(u3t(x_shape))), u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); } } @@ -3170,7 +3217,8 @@ case c3__i754: _set_rounding(rnd); u3_noun r_data = _soft_run(u3qi_la_linspace_i754(a, b, n, x_bloq)); - x_shape = u3nt(u3x_atom(n), 0x1, u3_nul); + if (r_data == u3_none) { return u3_none; } + x_shape = 
u3nc(u3x_atom(n), u3_nul); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: @@ -3210,6 +3258,7 @@ case c3__i754: _set_rounding(rnd); u3_noun r_data = _soft_run(u3qi_la_range_i754(a, b, d, x_bloq)); + if (r_data == u3_none) { return u3_none; } c3_d a_, b_, d_; c3_ds n_; switch (x_bloq) { @@ -3273,12 +3322,14 @@ x_kind = u3h(u3t(u3t(x_meta))); // 14 x_fxp = u3t(u3t(u3t(x_meta))); // 15 if ( c3n == u3ud(x_bloq) || - c3n == u3ud(x_kind) + c3n == u3ud(x_kind) || + c3n == _check(cor) ) { return u3m_bail(c3__exit); } else { u3_noun r_data = _soft_run(u3qi_la_diag(x_data, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } c3_d len_x0 = _get_dims(x_shape)[0]; return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); } @@ -3313,6 +3364,7 @@ switch (x_kind) { case c3__i754: { u3_noun r_data = _soft_run(u3qi_la_trace_i754(x_data, x_shape, x_bloq)); + if (r_data == u3_none) { return u3_none; } return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} default: @@ -3353,7 +3405,9 @@ y_fxp = u3t(u3t(u3t(y_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 if ( c3n == u3r_sing(x_bloq, y_bloq) || - c3n == u3r_sing(x_kind, y_kind) + c3n == u3r_sing(x_kind, y_kind) || + c3n == _check(u3nc(x_meta, x_data)) || + c3n == _check(u3nc(y_meta, y_data)) // fxp does not need to match so no check ) { diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index 19335435f7..7e790b21f5 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2169,7 +2169,7 @@ static u3j_core _139_non__la_core_d[] = { "cumsum", 7, _139_non__lagoon_cumsum_a, 0, no_hashes }, { "argmin", 7, _139_non__lagoon_argmin_a, 0, no_hashes }, { "argmax", 7, _139_non__lagoon_argmax_a, 0, no_hashes }, - // { "ravel", 7, _139_non__lagoon_ravel_a, 0, no_hashes }, + { "ravel", 7, _139_non__lagoon_ravel_a, 0, no_hashes }, { "min", 7, _139_non__lagoon_min_a, 0, no_hashes }, { "max", 7, 
_139_non__lagoon_max_a, 0, no_hashes }, { "linspace", 7, _139_non__lagoon_linspace_a, 0, no_hashes }, From 3e081bddd0e9ab379c8d7251dfbe5377f5dc5ed1 Mon Sep 17 00:00:00 2001 From: Santeri Hannula Date: Tue, 30 Jul 2024 13:25:32 +0300 Subject: [PATCH 69/97] pier: fix double boot protection on fresh boot --- pkg/vere/pier.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pkg/vere/pier.c b/pkg/vere/pier.c index 4dab2fbd3c..9c8ac0a4ec 100644 --- a/pkg/vere/pier.c +++ b/pkg/vere/pier.c @@ -585,11 +585,9 @@ _czar_boot_data(c3_c* czar_c, &czar_lyf, &czar_bon, &czar_ack)) && (c3y == u3r_safe_word(czar_glx, czar_glx_w)) && (c3y == u3r_safe_word(czar_ryf, czar_ryf_w)) && - (c3y == u3r_safe_word(czar_lyf, czar_lyf_w)) && - (c3y == u3du(czar_bon)) && - (c3y == u3r_safe_word(u3t(czar_bon), czar_bon_w)) && - (c3y == u3du(czar_ack)) && - (c3y == u3r_safe_word(u3t(czar_ack), czar_ack_w)) ) { + (c3y == u3r_safe_word(czar_lyf, czar_lyf_w)) ) { + if ( c3y == u3du(czar_bon) ) u3r_safe_word(u3t(czar_bon), czar_bon_w); + if ( c3y == u3du(czar_ack) ) u3r_safe_word(u3t(czar_ack), czar_ack_w); ret_o = c3y; } From a3363eafdb8dada91464195516668ce6120e69fb Mon Sep 17 00:00:00 2001 From: Santeri Hannula Date: Tue, 30 Jul 2024 13:45:25 +0300 Subject: [PATCH 70/97] pier: tweak double-boot protection messages --- pkg/vere/pier.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/pkg/vere/pier.c b/pkg/vere/pier.c index 9c8ac0a4ec..bd3190c8b1 100644 --- a/pkg/vere/pier.c +++ b/pkg/vere/pier.c @@ -629,27 +629,32 @@ _boot_scry_cb(void* vod_p, u3_noun nun) &czar_glx_w, &czar_ryf_w, &czar_lyf_w, &czar_bon_w, &czar_ack_w) ) { - u3l_log("boot: peer-state unvailable on czar, cannot protect from double boot"); + u3l_log("boot: peer-state unvailable on czar, cannot protect from double-boot"); _pier_work(wok_u); } else { if ( czar_ryf_w == ryf_w ) { c3_w ack_w = cur_w - 1; if ( czar_ack_w == 0xFFFFFFFF ) { // This codepath should never be 
hit - u3l_log("boot: message-sink-state unvailable on czar, cannot protect from double boot"); + u3l_log("boot: message-sink-state unvailable on czar, cannot protect from double-boot"); _pier_work(wok_u); } else if ( (czar_ack_w == ack_w) || ((nex_w > cur_w) && (czar_ack_w - 1 == ack_w)) ) { _pier_work(wok_u); } else { - u3l_log("boot: failed: czar last ack: %d, ship last ack: %d", - czar_ack_w, ack_w); + u3l_log("boot: failed: double-boot detected, refusing to boot %s\r\n" + "this pier is an old copy, boot the latest pier or breach\r\n" + "read more: https://docs.urbit.org/glossary/double-boot", + who_c); u3_king_bail(); } } else { // Trying to boot old ship after breach - u3l_log("boot: failed: rift in czar peer-state: %d, current rift: %d", - czar_ryf_w, ryf_w); + u3l_log("boot: failed: double-boot detected, refusing to boot %s\r\n" + "this ship has been breached since its initialization, " + "boot the latest pier or breach again\r\n" + "read more: https://docs.urbit.org/glossary/double-boot", + who_c); u3_king_bail(); } } @@ -674,14 +679,17 @@ _boot_scry_cb(void* vod_p, u3_noun nun) c3_free(czar_c); u3_weak kf_ryf = wok_u->pir_u->ryf; if ( kf_ryf == u3_none ) { - u3l_log("boot: keyfile rift unavailable, cannot protect from double boot"); + u3l_log("boot: keyfile rift unavailable, cannot protect from double-boot"); _pier_work(wok_u); } else if ( kf_ryf > czar_ryf_w ) { // Ship has breached, continue boot _pier_work(wok_u); } else { - u3l_log("boot: failed: rift in czar peer state: %d, keyfile rift: %d", - czar_ryf_w, kf_ryf); + u3l_log("boot: failed: double-boot detected, refusing to boot %s\r\n" + "this ship has already been booted elsewere, " + "boot the existing pier or breach\r\n" + "read more: https://docs.urbit.org/glossary/double-boot", + who_c); u3_king_bail(); } } @@ -690,7 +698,7 @@ _boot_scry_cb(void* vod_p, u3_noun nun) * Boot scry endpoint doesn't exists. Most likely old arvo. * Continue boot and hope for the best. 
*/ - u3l_log("boot: %%boot scry endpoint doesn't exist, cannot protect from double boot"); + u3l_log("boot: %%boot scry endpoint doesn't exist, cannot protect from double-boot"); _pier_work(wok_u); } u3z(nun); u3z(who); From 0c3ed2a5da49123567ecf759072c39ffb48cc5f1 Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Tue, 30 Jul 2024 18:14:52 -0400 Subject: [PATCH 71/97] http: slice in arvo, simplify --- pkg/vere/io/http.c | 228 +++++---------------------------------------- 1 file changed, 25 insertions(+), 203 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index af0dbf2ee9..a5b0ae99a8 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -642,35 +642,6 @@ _http_seq_new(u3_hcon* hon_u, h2o_req_t* rec_u) static void _http_cache_respond(u3_hreq* req_u, u3_noun nun); -static void -_http_scry_respond(u3_hreq* req_u, u3_noun nun); - -/* _content_headers: create content headers for response -*/ -static u3_noun -_content_headers(c3_z beg_z, c3_z end_z, c3_w tot_w) -{ - u3_noun out; - u3_noun byz; - u3_noun rng; - u3_noun len; - - u3_noun lin = u3i_list(u3i_string("bytes "), - u3do("crip", u3do("a-co:co", beg_z)), - c3_s1('-'), - u3do("crip", u3do("a-co:co", end_z)), - c3_s1('/'), - u3do("crip", u3do("a-co:co", tot_w)), - u3_none); - u3_atom dat = u3qc_rap(3, lin); - rng = u3nc(u3i_string("content-range"), dat); - len = u3nc(u3i_string("content-length"), - u3do("crip", u3do("a-co:co", (end_z - beg_z) + 1))); - - out = u3i_list(rng, len, u3_none); - return out; -} - typedef struct _range_request { c3_z beg_z; c3_z end_z; @@ -689,7 +660,7 @@ _parse_range(c3_c* txt_c, c3_w len_w) if ( hep_c ) { cut.beg_z = h2o_strtosize(txt_c, hep_c - txt_c); cut.end_z = h2o_strtosize(hep_c + 1, len_w - ((hep_c + 1) - txt_c)); - // strange -> [SIZE_MAX SIZE_MAX] so we return u3_nul in _slice_octs + // strange -> [SIZE_MAX SIZE_MAX] so we return u3_nul if ( ((SIZE_MAX == cut.beg_z) && (hep_c != txt_c)) || ((SIZE_MAX == cut.end_z) && (len_w - ((hep_c + 1) - txt_c) > 0)) ) 
{ @@ -723,105 +694,6 @@ _get_range(h2o_headers_t req_headers, range_request* rng_req) return c3y; } -typedef struct _content { - c3_z beg_z; - c3_z end_z; - u3_noun dat; // XX free -} content; - -/* _slice_octs: given a valid range, slice a section of octs -*/ -static content -_slice_octs(range_request rng, u3_noun octs) -{ - content out; - out.beg_z = SIZE_MAX; - out.end_z = SIZE_MAX; - out.dat = u3_nul; - - u3_noun len = u3h(octs); - c3_w len_w; - if ( c3n == u3r_safe_word(len, &len_w) ) { - return out; - } - - if ( (SIZE_MAX == rng.beg_z) - && (SIZE_MAX == rng.end_z) ) - { - // [~ ~] - return out; - } - - if ( SIZE_MAX == rng.end_z ) { - // [@ ~] - out.beg_z = rng.beg_z; - out.end_z = len_w - 1; - } - else if ( SIZE_MAX == rng.beg_z ) { - // [~ @] - out.beg_z = len_w - c3_min(rng.end_z, len_w); - out.end_z = len_w - 1; - } - else { - // [@ @] - out.beg_z = rng.beg_z; - out.end_z = c3_min(rng.end_z, len_w - 1); - } - - if ( (out.beg_z < len_w) - && (out.end_z < len_w) - && (out.beg_z <= out.end_z) ) - { - out.dat = u3nc((out.end_z - out.beg_z) + 1, - u3qc_cut(3, out.beg_z, (out.end_z + 1) - out.beg_z, u3t(octs))); - } - return out; -} - -static void -_http_range_respond(u3_hreq* req_u, u3_noun nac, range_request rng_req) -{ - u3_noun octs = u3r_at(127, nac); - if ( u3_none == octs ) { - // 400 - c3_c* msg_c = "bad request"; - h2o_send_error_generic(req_u->rec_u, 400, msg_c, msg_c, 0); - return; - } - content result = _slice_octs(rng_req, octs); - - if ( u3_nul == result.dat ) { - // 416 - c3_c* msg_c = "Requested Range Not Satisfiable"; - u3z(result.dat); - h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); - return; - } - - if ( c3y == u3r_sing(result.dat, octs) ) { - // 200 - u3z(result.dat); - u3_noun len = u3r_at(254, nac); - c3_w len_w; - if ( c3n == u3r_safe_word(len, &len_w) ) { - u3z(nac); - _http_scry_respond(req_u, u3_nul); - return; - } - u3_noun hez = _content_headers(0, (len_w - 1), len_w); - u3_noun res = u3i_edit(nac, 125, u3qb_weld(hez, 
u3r_at(125, nac))); - _http_cache_respond(req_u, res); - } - else { - // 206 - u3_noun hez = _content_headers(result.beg_z, result.end_z, u3h(octs)); - u3_noun res = u3i_edit(nac, 127, result.dat); - res = u3i_edit(res, 124, 206); - res = u3i_edit(res, 125, u3qb_weld(hez, u3r_at(125, res))); - _http_cache_respond(req_u, res); - } -} - /* _http_scry_cb() */ static void @@ -836,31 +708,18 @@ _http_scry_cb(void* vod_p, u3_noun nun) if ( req_u ) { u3_assert(u3_rsat_peek == req_u->sat_e); req_u->peq_u = 0; - if ( u3_nul == nun ) { - _http_scry_respond(req_u, u3k(nun)); - } - else { - h2o_headers_t req_headers = req_u->rec_u->headers; - range_request rng_req; - c3_o rng_o = _get_range(req_headers, &rng_req); - - if (c3y == rng_o ) { - _http_range_respond(req_u, nun, rng_req); - } - else { - _http_cache_respond(req_u, u3k(nun)); - } - } + _http_cache_respond(req_u, u3k(nun)); } // cache only if peek was not at now, and nun isn't u3_nul - if ( c3n == peq_u->las_o ) { - // XX pair of auth & path for key - // check ~watter-parter's + if ( (c3n == peq_u->las_o) + && (u3_nul != nun) ) + { u3_noun key = u3nc(auth, u3k(peq_u->pax)); u3h_put(htd_u->nax_p, key, nun); u3z(key); } + u3z(peq_u->pax); c3_free(peq_u); } @@ -1029,12 +888,27 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) } else { - u3_noun spur = u3nc(u3i_string("mime"), u3t(bem.pur)); + h2o_headers_t req_headers = req_u->rec_u->headers; + range_request rng_req; + c3_o rng_o = _get_range(req_headers, &rng_req); + + u3_noun spur; + if ( c3n == rng_o ) { + spur = u3nq(u3i_string("range"), c3_s1('0'), u3_blip, u3t(bem.pur)); + } + else { + u3_atom beg = ( SIZE_MAX == rng_req.beg_z) ? + u3_blip : u3dc("scot", c3__ud, u3i_chub(rng_req.beg_z)); + u3_atom end = ( SIZE_MAX == rng_req.end_z) ? 
+ u3_blip : u3dc("scot", c3__ud, u3i_chub(rng_req.end_z)); + + spur = u3nq(u3i_string("range"), beg, end, u3t(bem.pur)); + } + if ( c3y == req_u->peq_u->las_o ) { u3_pier_peek_last(htd_u->car_u.pir_u, gang, c3__ex, bem.des, spur, req_u->peq_u, _http_scry_cb); } - else { u3_noun bam = u3nq(bem.who, bem.des, bem.cas, spur); u3_noun key = u3nc(auth, u3k(bam)); @@ -1045,22 +919,13 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) || (u3_nul == nac) || ((u3_nul == gang) && (c3y == u3r_at(14, nac))) ) { - // cache, then serve subsequent range requests from cache + // maybe cache, then serve subsequent range requests from cache req_u->peq_u->pax = u3k(bam); u3_pier_peek(htd_u->car_u.pir_u, gang, u3nt(0, c3__ex, bam), req_u->peq_u, _http_scry_cb); } else { - h2o_headers_t req_headers = req_u->rec_u->headers; - range_request rng_req; - c3_o rng_o = _get_range(req_headers, &rng_req); - - if ( c3y == rng_o) { - _http_range_respond(req_u, nac, rng_req); - } - else { - _http_cache_respond(req_u, nac); - } + _http_cache_respond(req_u, nac); } } } @@ -1068,49 +933,6 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) } } -static void -_http_scry_respond(u3_hreq* req_u, u3_noun nun) -{ - h2o_req_t* rec_u = req_u->rec_u; - u3_httd* htd_u = req_u->hon_u->htp_u->htd_u; - - if ( u3_nul == nun ) { - u3_weak req = _http_rec_to_httq(rec_u); - if ( u3_none == req ) { - if ( (u3C.wag_w & u3o_verbose) ) { - u3l_log("strange %.*s request", (c3_i)rec_u->method.len, - rec_u->method.base); - } - c3_c* msg_c = "bad request"; - h2o_send_error_generic(rec_u, 400, msg_c, msg_c, 0); - } - else { - h2o_send_error_500(rec_u, "Internal Server Error", "scry failed", 0); - } - } - else if ( u3_none == u3r_at(15, nun) ) { - h2o_send_error_500(rec_u, "Internal Server Error", "scry failed", 0); - } - else { - u3_noun auth, response_header, data; - u3x_qual(u3k(u3t(u3t(nun))), &auth, 0, &response_header, &data); - u3_noun status, headers; - u3x_cell(response_header, &status, &headers); - - // check auth - if 
( (c3y == auth) - && (c3n == _http_req_is_auth(&htd_u->fig_u, rec_u)) ) - { - h2o_send_error_403(rec_u, "Unauthorized", "unauthorized", 0); - } - else { - req_u->sat_e = u3_rsat_plan; - _http_start_respond(req_u, u3k(status), u3k(headers), u3k(data), c3y); - } - } - u3z(nun); -} - /* _http_cache_respond(): respond with a simple-payload:http */ static void From 64ecf0cf7e74c4e80bd709617d800b4f26bd2932 Mon Sep 17 00:00:00 2001 From: Tinnus Napbus Date: Wed, 31 Jul 2024 22:26:19 +1200 Subject: [PATCH 72/97] dill: send %born event on start --- pkg/vere/io/term.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pkg/vere/io/term.c b/pkg/vere/io/term.c index bf9789aabc..b5e75e2569 100644 --- a/pkg/vere/io/term.c +++ b/pkg/vere/io/term.c @@ -1590,7 +1590,19 @@ u3_term_wall(u3_noun wol) static void _term_io_talk(u3_auto* car_u) { + //TODO reevaluate wrt dill sessions + // + u3_noun wir = u3nt(c3__term, '1', u3_nul); + u3_noun cad; + + // send born event + { + cad = u3nc(c3__born, u3_nul); + _term_ovum_plan(car_u, u3k(wir), cad); + } + if ( c3n == u3_Host.ops_u.tem ) { + u3z(wir); u3_utty* uty_u = _term_main(); uv_read_start((uv_stream_t*)&(uty_u->pin_u), @@ -1598,11 +1610,6 @@ _term_io_talk(u3_auto* car_u) _term_read_cb); } - //TODO reevaluate wrt dill sessions - // - u3_noun wir = u3nt(c3__term, '1', u3_nul); - u3_noun cad; - // send terminal dimensions // { From 13e08dbcac9e51f25bff0e347aecaf8386afd27e Mon Sep 17 00:00:00 2001 From: Sigilante Date: Wed, 31 Jul 2024 12:40:42 -0500 Subject: [PATCH 73/97] Post corrected traversal order. 
--- pkg/noun/jets/i/lagoon.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/pkg/noun/jets/i/lagoon.c b/pkg/noun/jets/i/lagoon.c index 01e8e9954a..64447176fb 100644 --- a/pkg/noun/jets/i/lagoon.c +++ b/pkg/noun/jets/i/lagoon.c @@ -516,8 +516,8 @@ case 4: { float16_t sum16[2]; sum16[0] = (float16_t){SB_REAL16_ZERO}; - for (c3_d i = 0; i < len_x; i++) { - sum16[0] = f16_add(sum16[0], ((float16_t*)x_bytes)[i]); + for (c3_d i = len_x; i > 0; i--) { + sum16[0] = f16_add(sum16[0], ((float16_t*)x_bytes)[i-1]); } sum16[1].v = 0x1; r_data = u3i_bytes((2+1)*sizeof(c3_y), (c3_y*)sum16); @@ -526,8 +526,8 @@ case 5: { float32_t sum32[2]; sum32[0] = (float32_t){SB_REAL32_ZERO}; - for (c3_d i = 0; i < len_x; i++) { - sum32[0] = f32_add(sum32[0], ((float32_t*)x_bytes)[i]); + for (c3_d i = len_x; i > 0; i--) { + sum32[0] = f32_add(sum32[0], ((float32_t*)x_bytes)[i-1]); } sum32[1].v = 0x1; r_data = u3i_bytes((4+1)*sizeof(c3_y), (c3_y*)sum32); @@ -536,8 +536,8 @@ case 6: { float64_t sum64[2]; sum64[0] = (float64_t){SB_REAL64_ZERO}; - for (c3_d i = 0; i < len_x; i++) { - sum64[0] = f64_add(sum64[0], ((float64_t*)x_bytes)[i]); + for (c3_d i = len_x; i > 0; i--) { + sum64[0] = f64_add(sum64[0], ((float64_t*)x_bytes)[i-1]); } sum64[1].v = 0x1; r_data = u3i_bytes((8+1)*sizeof(c3_y), (c3_y*)sum64); @@ -546,8 +546,8 @@ case 7: { float128_t sum128[2]; sum128[0] = (float128_t){SB_REAL128L_ZERO, SB_REAL128U_ZERO}; - for (c3_d i = 0; i < len_x; i++) { - f128M_add(&(sum128[0]), &(((float128_t*)x_bytes)[i]), &(sum128[0])); + for (c3_d i = len_x; i > 0; i--) { + f128M_add(&(sum128[0]), &(((float128_t*)x_bytes)[i-1]), &(sum128[0])); } sum128[1] = (float128_t){0x1, 0x0}; r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)sum128); @@ -1883,8 +1883,9 @@ for (c3_d i = 1; i < n-1; i++) { ((float16_t*)x_bytes16)[i] = f16_add(a16, f16_mul(i32_to_f16(i), interval16)); } - ((float16_t*)x_bytes16)[0] = a16; + // Assign in reverse order so that n=1 case is 
correctly left-hand bound. ((float16_t*)x_bytes16)[n-1] = b16; + ((float16_t*)x_bytes16)[0] = a16; x_bytes16[n*2] = 0x1; // pin head r_data = u3i_bytes((n*2+1)*sizeof(c3_y), x_bytes16); u3a_free(x_bytes16); @@ -1900,8 +1901,8 @@ for (c3_d i = 1; i < n-1; i++) { ((float32_t*)x_bytes32)[i] = f32_add(a32, f32_mul(i32_to_f32(i), interval32)); } - ((float32_t*)x_bytes32)[0] = a32; ((float32_t*)x_bytes32)[n-1] = b32; + ((float32_t*)x_bytes32)[0] = a32; x_bytes32[n*4] = 0x1; // pin head r_data = u3i_bytes((n*4+1)*sizeof(c3_y), x_bytes32); u3a_free(x_bytes32); @@ -1917,8 +1918,8 @@ for (c3_d i = 1; i < n-1; i++) { ((float64_t*)x_bytes64)[i] = f64_add(a64, f64_mul(i32_to_f64(i), interval64)); } - ((float64_t*)x_bytes64)[0] = a64; ((float64_t*)x_bytes64)[n-1] = b64; + ((float64_t*)x_bytes64)[0] = a64; x_bytes64[n*8] = 0x1; // pin head r_data = u3i_bytes((n*8+1)*sizeof(c3_y), x_bytes64); u3a_free(x_bytes64); @@ -1941,8 +1942,8 @@ f128M_mul(&i128, &interval128, &((float128_t*)x_bytes128)[i]); f128M_add(&a128, &((float128_t*)x_bytes128)[i], &((float128_t*)x_bytes128)[i]); } - ((float128_t*)x_bytes128)[0] = a128; ((float128_t*)x_bytes128)[n-1] = b128; + ((float128_t*)x_bytes128)[0] = a128; x_bytes128[n*16] = 0x1; // pin head r_data = u3i_bytes((n*16+1)*sizeof(c3_y), x_bytes128); u3a_free(x_bytes128); @@ -2460,7 +2461,7 @@ _set_rounding(rnd); u3_noun r_data = _soft_run(u3qi_la_cumsum_i754(x_data, x_shape, x_bloq)); if (r_data == u3_none) { return u3_none; } - return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + return u3nc(u3nq(u3nc(0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); default: return u3_none; @@ -3208,7 +3209,9 @@ x_fxp = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 if ( c3n == u3ud(x_bloq) || - c3n == u3ud(x_kind) + c3n == u3ud(x_kind) || + c3n == u3ud(n) || + (n < 1) // crash on zero size ) { return u3m_bail(c3__exit); From 0ac53bdeaeda4a8cddb16cbcff14d99c46eb00dc Mon Sep 17 00:00:00 2001 From: 
midden-fabler Date: Wed, 31 Jul 2024 20:17:29 -0400 Subject: [PATCH 74/97] http: scry & cache chunks, cleanup --- pkg/vere/io/http.c | 109 +++++++++++++++++++++++++++++---------------- 1 file changed, 70 insertions(+), 39 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index a5b0ae99a8..482fb976d7 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -642,53 +642,71 @@ _http_seq_new(u3_hcon* hon_u, h2o_req_t* rec_u) static void _http_cache_respond(u3_hreq* req_u, u3_noun nun); -typedef struct _range_request { +typedef struct _byte_range { c3_z beg_z; c3_z end_z; -} range_request; +} byte_range; -/* _parse_range: get a range from '-' delimited text +/* _chunk_align(): align range to a nearby chunk */ -static range_request +static void +_chunk_align(byte_range* rng_u) +{ + c3_z siz_z = 4194304; // 4MiB + + if ( SIZE_MAX != rng_u->beg_z ) { + c3_z tmp_z = rng_u->beg_z / siz_z; + rng_u->beg_z = tmp_z * siz_z; + rng_u->end_z = ((tmp_z + 1) * siz_z) - 1; + } + else if ( SIZE_MAX != rng_u->end_z ) { + // round to multiple of siz_z + rng_u->end_z = siz_z * ((rng_u->end_z / siz_z) + 1); + } +} + +/* _parse_range(): get a range from '-' delimited text +*/ +static byte_range _parse_range(c3_c* txt_c, c3_w len_w) { c3_c* hep_c = memchr(txt_c, '-', len_w); - range_request cut; - cut.beg_z = SIZE_MAX; - cut.end_z = SIZE_MAX; + byte_range rng_u; + rng_u.beg_z = SIZE_MAX; + rng_u.end_z = SIZE_MAX; if ( hep_c ) { - cut.beg_z = h2o_strtosize(txt_c, hep_c - txt_c); - cut.end_z = h2o_strtosize(hep_c + 1, len_w - ((hep_c + 1) - txt_c)); - // strange -> [SIZE_MAX SIZE_MAX] so we return u3_nul - if ( ((SIZE_MAX == cut.beg_z) && (hep_c != txt_c)) - || ((SIZE_MAX == cut.end_z) && (len_w - ((hep_c + 1) - txt_c) > 0)) ) + rng_u.beg_z = h2o_strtosize(txt_c, hep_c - txt_c); + rng_u.end_z = h2o_strtosize(hep_c + 1, len_w - ((hep_c + 1) - txt_c)); + // strange -> [SIZE_MAX SIZE_MAX] + if ( ((SIZE_MAX == rng_u.beg_z) && (hep_c != txt_c)) + || ((SIZE_MAX == rng_u.end_z) && 
(len_w - ((hep_c + 1) - txt_c) > 0)) ) { - cut.beg_z = SIZE_MAX; - cut.end_z = SIZE_MAX; + rng_u.beg_z = SIZE_MAX; + rng_u.end_z = SIZE_MAX; } } - return cut; + return rng_u; } static c3_o -_get_range(h2o_headers_t req_headers, range_request* rng_req) +_get_range(h2o_headers_t req_headers, byte_range* rng_u) { - rng_req->beg_z = SIZE_MAX; - rng_req->end_z = SIZE_MAX; + rng_u->beg_z = SIZE_MAX; + rng_u->end_z = SIZE_MAX; - c3_w idx = h2o_find_header(&req_headers, H2O_TOKEN_RANGE, -1); - if (idx == UINT32_MAX) { + c3_w inx_w = h2o_find_header(&req_headers, H2O_TOKEN_RANGE, -1); + if (inx_w == UINT32_MAX) { return c3n; } - if ( (req_headers.entries[idx].value.len >= 6) - && (0 == memcmp("bytes=", req_headers.entries[idx].value.base, 6)) ) + if ( (req_headers.entries[inx_w].value.len >= 6) + && (0 == memcmp("bytes=", req_headers.entries[inx_w].value.base, 6)) ) { - range_request tmp = _parse_range(req_headers.entries[idx].value.base + 6, - req_headers.entries[idx].value.len - 6); - rng_req->beg_z = tmp.beg_z; - rng_req->end_z = tmp.end_z; + byte_range tmp_u = _parse_range(req_headers.entries[inx_w].value.base + 6, + req_headers.entries[inx_w].value.len - 6); + rng_u->beg_z = tmp_u.beg_z; + rng_u->end_z = tmp_u.end_z; } return c3y; @@ -797,9 +815,8 @@ _get_beam(u3_hreq* req_u, c3_c* txt_c, c3_w len_w) nex_c = fas_c; } if ( !nex_c ) { - c3_c* msg_c = "bad beam"; - // h2o_send_error_generic(req_u->rec_u, 400, msg_c, msg_c, 0); - // return; + // XX bad beam + *wer = u3_nul; } else { c3_w dif_w = (c3_p)(nex_c - txt_c); @@ -864,8 +881,6 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) u3_hfig* fig_u = &req_u->hon_u->htp_u->htd_u->fig_u; h2o_req_t* rec_u = req_u->rec_u; - u3_noun our = u3dc("scot", 'p', u3i_chubs(2, htd_u->car_u.pir_u->who_d)); - // set gang to [~ ~] or ~ u3_noun gang; c3_o auth = _http_req_is_auth(fig_u, rec_u); @@ -876,35 +891,51 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) gang = u3_nul; } + u3_noun our = u3dc("scot", 'p', u3i_chubs(2, 
htd_u->car_u.pir_u->who_d)); + // XX weak beam? beam bem = _get_beam(req_u, bas_c, len_w); - // XX necessary? if ( (u3_nul == bem.pur) || (c3n == u3r_sing(our, bem.who)) ) { - c3_c* msg_c = "bad scry path"; + c3_c* msg_c = "bad request"; h2o_send_error_generic(req_u->rec_u, 400, msg_c, msg_c, 0); return; } else { h2o_headers_t req_headers = req_u->rec_u->headers; - range_request rng_req; - c3_o rng_o = _get_range(req_headers, &rng_req); + byte_range rng_u; + c3_o rng_o = _get_range(req_headers, &rng_u); + // prepare spur for eyre range scry + // u3_noun spur; if ( c3n == rng_o ) { + // full range: '/range/0//foo' spur = u3nq(u3i_string("range"), c3_s1('0'), u3_blip, u3t(bem.pur)); } else { - u3_atom beg = ( SIZE_MAX == rng_req.beg_z) ? - u3_blip : u3dc("scot", c3__ud, u3i_chub(rng_req.beg_z)); - u3_atom end = ( SIZE_MAX == rng_req.end_z) ? - u3_blip : u3dc("scot", c3__ud, u3i_chub(rng_req.end_z)); + if ( (SIZE_MAX == rng_u.beg_z) + && (SIZE_MAX == rng_u.end_z) ) + { + c3_c* msg_c = "Requested Range Not Satisfiable"; + h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); + return; + } + + _chunk_align(&rng_u); + + u3_atom beg = ( SIZE_MAX == rng_u.beg_z) ? + u3_blip : u3dc("scot", c3__ud, u3i_chub(rng_u.beg_z)); + u3_atom end = ( SIZE_MAX == rng_u.end_z) ? 
+ u3_blip : u3dc("scot", c3__ud, u3i_chub(rng_u.end_z)); spur = u3nq(u3i_string("range"), beg, end, u3t(bem.pur)); } + // peek or respond from cache + // if ( c3y == req_u->peq_u->las_o ) { u3_pier_peek_last(htd_u->car_u.pir_u, gang, c3__ex, bem.des, spur, req_u->peq_u, _http_scry_cb); From b07bf086fd2af2693867b7c33efbacfec0bcb6be Mon Sep 17 00:00:00 2001 From: Tinnus Napbus Date: Thu, 1 Aug 2024 17:48:46 +1200 Subject: [PATCH 75/97] dill: revise born event logic --- pkg/vere/io/term.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pkg/vere/io/term.c b/pkg/vere/io/term.c index b5e75e2569..60fa6a68fa 100644 --- a/pkg/vere/io/term.c +++ b/pkg/vere/io/term.c @@ -1590,26 +1590,26 @@ u3_term_wall(u3_noun wol) static void _term_io_talk(u3_auto* car_u) { + if ( c3n == u3_Host.ops_u.tem ) { + u3_utty* uty_u = _term_main(); + + uv_read_start((uv_stream_t*)&(uty_u->pin_u), + _term_alloc, + _term_read_cb); + } + //TODO reevaluate wrt dill sessions // u3_noun wir = u3nt(c3__term, '1', u3_nul); u3_noun cad; // send born event + // { cad = u3nc(c3__born, u3_nul); _term_ovum_plan(car_u, u3k(wir), cad); } - if ( c3n == u3_Host.ops_u.tem ) { - u3z(wir); - u3_utty* uty_u = _term_main(); - - uv_read_start((uv_stream_t*)&(uty_u->pin_u), - _term_alloc, - _term_read_cb); - } - // send terminal dimensions // { From 95b2e285bde364a3e911f4aa2c1f1f2e48775d77 Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Thu, 1 Aug 2024 13:47:11 -0400 Subject: [PATCH 76/97] http: weak spur, refs, cleanup --- pkg/vere/io/http.c | 205 +++++++++++++++++++++++++++------------------ 1 file changed, 125 insertions(+), 80 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index 482fb976d7..5bf8fcc169 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -642,6 +642,9 @@ _http_seq_new(u3_hcon* hon_u, h2o_req_t* rec_u) static void _http_cache_respond(u3_hreq* req_u, u3_noun nun); +static void +_http_scry_respond(u3_hreq* req_u, u3_noun nun); + typedef 
struct _byte_range { c3_z beg_z; c3_z end_z; @@ -655,12 +658,11 @@ _chunk_align(byte_range* rng_u) c3_z siz_z = 4194304; // 4MiB if ( SIZE_MAX != rng_u->beg_z ) { - c3_z tmp_z = rng_u->beg_z / siz_z; - rng_u->beg_z = tmp_z * siz_z; - rng_u->end_z = ((tmp_z + 1) * siz_z) - 1; + rng_u->beg_z = (rng_u->beg_z / siz_z) * siz_z; + rng_u->end_z = (rng_u->beg_z + siz_z) - 1; } else if ( SIZE_MAX != rng_u->end_z ) { - // round to multiple of siz_z + // round up to multiple of siz_z rng_u->end_z = siz_z * ((rng_u->end_z / siz_z) + 1); } } @@ -696,7 +698,7 @@ _get_range(h2o_headers_t req_headers, byte_range* rng_u) rng_u->end_z = SIZE_MAX; c3_w inx_w = h2o_find_header(&req_headers, H2O_TOKEN_RANGE, -1); - if (inx_w == UINT32_MAX) { + if ( UINT32_MAX == inx_w) { return c3n; } @@ -726,7 +728,7 @@ _http_scry_cb(void* vod_p, u3_noun nun) if ( req_u ) { u3_assert(u3_rsat_peek == req_u->sat_e); req_u->peq_u = 0; - _http_cache_respond(req_u, u3k(nun)); + _http_scry_respond(req_u, u3k(nun)); } // cache only if peek was not at now, and nun isn't u3_nul @@ -737,6 +739,9 @@ _http_scry_cb(void* vod_p, u3_noun nun) u3h_put(htd_u->nax_p, key, nun); u3z(key); } + else { + u3z(nun); + } u3z(peq_u->pax); c3_free(peq_u); @@ -746,18 +751,25 @@ typedef struct _beam { u3_noun who; u3_noun des; u3_noun cas; - u3_noun pur; + u3_weak pur; } beam; +static void +_free_beam(beam* bem) +{ + u3z(bem->who); + u3z(bem->des); + u3z(bem->cas); + u3z(bem->pur); +} + /* _get_beam: path to beam */ static beam _get_beam(u3_hreq* req_u, c3_c* txt_c, c3_w len_w) { beam bem; - u3_http* htp_u = req_u->hon_u->htp_u; - u3_httd* htd_u = htp_u->htd_u; - u3_noun our = u3dc("scot", 'p', u3i_chubs(2, htd_u->car_u.pir_u->who_d)); + // get beak from path // for ( c3_w i_w = 0; i_w < 3; ++i_w ) { @@ -789,10 +801,12 @@ _get_beam(u3_hreq* req_u, c3_c* txt_c, c3_w len_w) // '=' if ( (len_w > 0) && ('=' == txt_c[0]) ) { if ( 0 == i_w ) { - *wer = our; + u3_http* htp_u = req_u->hon_u->htp_u; + u3_httd* htd_u = htp_u->htd_u; + *wer 
= u3dc("scot", 'p', u3i_chubs(2, htd_u->car_u.pir_u->who_d)); } else if ( 1 == i_w ) { - *wer = u3i_string("base"); + *wer = c3__base; } else { req_u->peq_u->las_o = c3y; @@ -805,15 +819,14 @@ _get_beam(u3_hreq* req_u, c3_c* txt_c, c3_w len_w) c3_c* nex_c; c3_c* tis_c = memchr(txt_c, '=', len_w); c3_c* fas_c = memchr(txt_c, '/', len_w); + if ( tis_c && fas_c ) { nex_c = c3_min(tis_c, fas_c); } - else if ( tis_c ) { - nex_c = tis_c; - } else { - nex_c = fas_c; + nex_c = ( tis_c ) ? tis_c : fas_c; } + if ( !nex_c ) { // XX bad beam *wer = u3_nul; @@ -827,7 +840,9 @@ _get_beam(u3_hreq* req_u, c3_c* txt_c, c3_w len_w) } } - bem.pur = u3dc("rush", u3i_bytes(len_w, (const c3_y*)txt_c), u3v_wish("stap")); + u3_noun tmp = u3dc("rush", u3i_bytes(len_w, (const c3_y*)txt_c), u3v_wish("stap")); + bem.pur = ( u3_nul == tmp ) ? u3_none : u3k(u3t(tmp)); + u3z(tmp); return bem; } @@ -843,13 +858,17 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) { u3_http* htp_u = req_u->hon_u->htp_u; u3_httd* htd_u = htp_u->htd_u; - u3_noun wir = _http_req_to_duct(req_u); - u3_noun cad; c3_c* bas_c = req_u->rec_u->input.path.base; c3_w len_w = req_u->rec_u->input.path.len; + // check if base url starts with '/_~_/' + if ( (len_w < 6) + || (0 != memcmp("/_~_/", bas_c, 5)) ) { + // no: inject to arvo + u3_noun wir = _http_req_to_duct(req_u); + u3_noun cad; u3_noun adr = u3nc(c3__ipv4, u3i_words(1, &req_u->hon_u->ipf_w)); // XX loopback automatically secure too? // @@ -858,13 +877,6 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) cad = ( c3y == req_u->hon_u->htp_u->lop ) ? 
u3nc(u3i_string("request-local"), dat) : u3nc(u3i_string("request"), dat); - } - - // check if base url starts with '/_~_/' - if ( (len_w < 6) - || (0 != memcmp("/_~_/", bas_c, 5)) ) - { - // no: inject to arvo u3_auto_plan(&htd_u->car_u, u3_ovum_init(0, c3__e, wir, cad)); } else { @@ -891,75 +903,65 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) gang = u3_nul; } - u3_noun our = u3dc("scot", 'p', u3i_chubs(2, htd_u->car_u.pir_u->who_d)); - // XX weak beam? beam bem = _get_beam(req_u, bas_c, len_w); - - if ( (u3_nul == bem.pur) - || (c3n == u3r_sing(our, bem.who)) ) - { + if ( u3_none == bem.pur ) { c3_c* msg_c = "bad request"; h2o_send_error_generic(req_u->rec_u, 400, msg_c, msg_c, 0); + u3z(gang); + _free_beam(&bem); return; } - else { - h2o_headers_t req_headers = req_u->rec_u->headers; - byte_range rng_u; - c3_o rng_o = _get_range(req_headers, &rng_u); + h2o_headers_t req_headers = req_u->rec_u->headers; + byte_range rng_u; + c3_o rng_o = _get_range(req_headers, &rng_u); - // prepare spur for eyre range scry - // - u3_noun spur; - if ( c3n == rng_o ) { - // full range: '/range/0//foo' - spur = u3nq(u3i_string("range"), c3_s1('0'), u3_blip, u3t(bem.pur)); - } - else { - if ( (SIZE_MAX == rng_u.beg_z) - && (SIZE_MAX == rng_u.end_z) ) - { - c3_c* msg_c = "Requested Range Not Satisfiable"; - h2o_send_error_generic(req_u->rec_u, 416, msg_c, msg_c, 0); - return; - } + // prepare spur for eyre range scry + // + u3_noun spur; + if ( c3n == rng_o ) { + // full range: '/range/0//foo' + spur = u3nq(u3i_string("range"), c3_s1('0'), u3_blip, u3k(bem.pur)); + } + else { + _chunk_align(&rng_u); - _chunk_align(&rng_u); + u3_atom beg = ( SIZE_MAX == rng_u.beg_z) ? + u3_blip : u3dc("scot", c3__ud, u3i_chub(rng_u.beg_z)); + u3_atom end = ( SIZE_MAX == rng_u.end_z) ? + u3_blip : u3dc("scot", c3__ud, u3i_chub(rng_u.end_z)); - u3_atom beg = ( SIZE_MAX == rng_u.beg_z) ? - u3_blip : u3dc("scot", c3__ud, u3i_chub(rng_u.beg_z)); - u3_atom end = ( SIZE_MAX == rng_u.end_z) ? 
- u3_blip : u3dc("scot", c3__ud, u3i_chub(rng_u.end_z)); + spur = u3nq(u3i_string("range"), beg, end, u3k(bem.pur)); + } - spur = u3nq(u3i_string("range"), beg, end, u3t(bem.pur)); - } + // peek or respond from cache + // + if ( c3y == req_u->peq_u->las_o ) { + u3_pier_peek_last(htd_u->car_u.pir_u, gang, c3__ex, + u3k(bem.des), spur, req_u->peq_u, _http_scry_cb); + } + else { + u3_noun bam = u3nq(u3k(bem.who), u3k(bem.des), u3k(bem.cas), spur); + u3_noun key = u3nc(auth, u3k(bam)); + u3_weak nac = u3h_get(htd_u->nax_p, key); + u3z(key); - // peek or respond from cache - // - if ( c3y == req_u->peq_u->las_o ) { - u3_pier_peek_last(htd_u->car_u.pir_u, gang, c3__ex, - bem.des, spur, req_u->peq_u, _http_scry_cb); + if ( (u3_none == nac) + || ((u3_nul == gang) && (c3y == u3r_at(14, nac))) ) + { + // maybe cache, then serve subsequent range requests from cache + req_u->peq_u->pax = u3k(bam); + u3_pier_peek(htd_u->car_u.pir_u, gang, u3nt(0, c3__ex, bam), + req_u->peq_u, _http_scry_cb); + u3z(nac); } else { - u3_noun bam = u3nq(bem.who, bem.des, bem.cas, spur); - u3_noun key = u3nc(auth, u3k(bam)); - u3_weak nac = u3h_get(htd_u->nax_p, key); - u3z(key); - - if ( (u3_none == nac) - || (u3_nul == nac) - || ((u3_nul == gang) && (c3y == u3r_at(14, nac))) ) - { - // maybe cache, then serve subsequent range requests from cache - req_u->peq_u->pax = u3k(bam); - u3_pier_peek(htd_u->car_u.pir_u, gang, u3nt(0, c3__ex, bam), - req_u->peq_u, _http_scry_cb); - } - else { - _http_cache_respond(req_u, nac); - } + _http_scry_respond(req_u, nac); + u3z(bam); + u3z(gang); } } + _free_beam(&bem); } } } @@ -1010,6 +1012,49 @@ _http_cache_respond(u3_hreq* req_u, u3_noun nun) u3z(nun); } +static void +_http_scry_respond(u3_hreq* req_u, u3_noun nun) +{ + h2o_req_t* rec_u = req_u->rec_u; + u3_httd* htd_u = req_u->hon_u->htp_u->htd_u; + + if ( u3_nul == nun ) { + u3_weak req = _http_rec_to_httq(rec_u); + if ( u3_none == req ) { + if ( (u3C.wag_w & u3o_verbose) ) { + u3l_log("strange %.*s request", 
(c3_i)rec_u->method.len, + rec_u->method.base); + } + c3_c* msg_c = "bad request"; + h2o_send_error_generic(rec_u, 400, msg_c, msg_c, 0); + } + else { + h2o_send_error_500(rec_u, "Internal Server Error", "scry failed", 0); + } + } + else if ( u3_none == u3r_at(7, nun) ) { + h2o_send_error_500(rec_u, "Internal Server Error", "scry failed", 0); + } + else { + u3_noun auth, response_header, data; + u3x_qual(u3k(u3t(u3t(nun))), &auth, 0, &response_header, &data); + u3_noun status, headers; + u3x_cell(response_header, &status, &headers); + + // check auth + if ( (c3y == auth) + && (c3n == _http_req_is_auth(&htd_u->fig_u, rec_u)) ) + { + h2o_send_error_403(rec_u, "Unauthorized", "unauthorized", 0); + } + else { + req_u->sat_e = u3_rsat_plan; + _http_start_respond(req_u, u3k(status), u3k(headers), u3k(data), c3y); + } + } + u3z(nun); +} + /* _http_cache_scry_cb(): insert scry result into noun cache */ static void From 840760d003b66602a640f45308610fca302e2847 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Thu, 1 Aug 2024 18:16:11 -0500 Subject: [PATCH 77/97] Post second review tweaks. 
--- pkg/noun/jets/i/lagoon.c | 374 +++++++++++++-------------------------- 1 file changed, 127 insertions(+), 247 deletions(-) diff --git a/pkg/noun/jets/i/lagoon.c b/pkg/noun/jets/i/lagoon.c index 64447176fb..5da333ecf0 100644 --- a/pkg/noun/jets/i/lagoon.c +++ b/pkg/noun/jets/i/lagoon.c @@ -30,16 +30,6 @@ c3_d c[2]; }; -/* soft check on u3_none return from q jet -*/ - static inline u3_noun _soft_run(u3_noun a) - { - if (u3_none == a) { - u3m_bail(c3__fail); - } - return a; - } - // $?(%n %u %d %z %a) static inline void _set_rounding(c3_w a) @@ -2161,31 +2151,22 @@ u3x_sam_6, &y_meta, u3x_sam_7, &y_data, 0) || + c3n == u3r_sing(x_meta, y_meta) || c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp, - y_shape, y_bloq, y_kind, y_fxp, + u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 - y_shape = u3h(y_meta); // 2 - y_bloq = u3h(u3t(y_meta)); // 6 - y_kind = u3h(u3t(u3t(y_meta))); // 14 - y_fxp = u3t(u3t(u3t(y_meta))); // 15 + x_tail = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 if ( c3n == u3ud(x_bloq) || - c3n == u3ud(y_bloq) || c3n == u3ud(x_kind) || - c3n == u3ud(y_kind) || - c3n == u3r_sing(x_shape, y_shape) || - c3n == u3r_sing(x_bloq, y_bloq) || - c3n == u3r_sing(x_kind, y_kind) - // fxp does not need to match here so no check + c3n == u3ud(rnd) ) { return u3m_bail(c3__exit); @@ -2193,9 +2174,9 @@ switch (x_kind) { case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_add_i754(x_data, y_data, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_add_i754(x_data, y_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); default: return u3_none; @@ -2217,31 
+2198,22 @@ u3x_sam_6, &y_meta, u3x_sam_7, &y_data, 0) || + c3n == u3r_sing(x_meta, y_meta) || c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp, - y_shape, y_bloq, y_kind, y_fxp, + u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 - y_shape = u3h(y_meta); // 2 - y_bloq = u3h(u3t(y_meta)); // 6 - y_kind = u3h(u3t(u3t(y_meta))); // 14 - y_fxp = u3t(u3t(u3t(y_meta))); // 15 + x_tail = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 if ( c3n == u3ud(x_bloq) || - c3n == u3ud(y_bloq) || c3n == u3ud(x_kind) || - c3n == u3ud(y_kind) || - c3n == u3r_sing(x_shape, y_shape) || - c3n == u3r_sing(x_bloq, y_bloq) || - c3n == u3r_sing(x_kind, y_kind) - // fxp does not need to match here so no check + c3n == u3ud(rnd) ) { return u3m_bail(c3__exit); @@ -2249,9 +2221,9 @@ switch (x_kind) { case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_sub_i754(x_data, y_data, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_sub_i754(x_data, y_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); default: return u3_none; @@ -2273,31 +2245,22 @@ u3x_sam_6, &y_meta, u3x_sam_7, &y_data, 0) || + c3n == u3r_sing(x_meta, y_meta) || c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp, - y_shape, y_bloq, y_kind, y_fxp, + u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 - y_shape = u3h(y_meta); // 2 - y_bloq = u3h(u3t(y_meta)); // 6 - y_kind = u3h(u3t(u3t(y_meta))); // 14 - y_fxp = 
u3t(u3t(u3t(y_meta))); // 15 + x_tail = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 if ( c3n == u3ud(x_bloq) || - c3n == u3ud(y_bloq) || c3n == u3ud(x_kind) || - c3n == u3ud(y_kind) || - c3n == u3r_sing(x_shape, y_shape) || - c3n == u3r_sing(x_bloq, y_bloq) || - c3n == u3r_sing(x_kind, y_kind) - // fxp does not need to match here so no check + c3n == u3ud(rnd) ) { return u3m_bail(c3__exit); @@ -2305,9 +2268,9 @@ switch (x_kind) { case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_mul_i754(x_data, y_data, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_mul_i754(x_data, y_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); default: return u3_none; @@ -2329,31 +2292,22 @@ u3x_sam_6, &y_meta, u3x_sam_7, &y_data, 0) || + c3n == u3r_sing(x_meta, y_meta) || c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp, - y_shape, y_bloq, y_kind, y_fxp, + u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 - y_shape = u3h(y_meta); // 2 - y_bloq = u3h(u3t(y_meta)); // 6 - y_kind = u3h(u3t(u3t(y_meta))); // 14 - y_fxp = u3t(u3t(u3t(y_meta))); // 15 + x_tail = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 if ( c3n == u3ud(x_bloq) || - c3n == u3ud(y_bloq) || c3n == u3ud(x_kind) || - c3n == u3ud(y_kind) || - c3n == u3r_sing(x_shape, y_shape) || - c3n == u3r_sing(x_bloq, y_bloq) || - c3n == u3r_sing(x_kind, y_kind) - // fxp does not need to match here so no check + c3n == u3ud(rnd) ) { return u3m_bail(c3__exit); @@ -2361,9 +2315,9 @@ switch (x_kind) { case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_div_i754(x_data, y_data, x_shape, 
x_bloq)); + u3_noun r_data = u3qi_la_div_i754(x_data, y_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); default: return u3_none; @@ -2385,31 +2339,22 @@ u3x_sam_6, &y_meta, u3x_sam_7, &y_data, 0) || + c3n == u3r_sing(x_meta, y_meta) || c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp, - y_shape, y_bloq, y_kind, y_fxp, + u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 - y_shape = u3h(y_meta); // 2 - y_bloq = u3h(u3t(y_meta)); // 6 - y_kind = u3h(u3t(u3t(y_meta))); // 14 - y_fxp = u3t(u3t(u3t(y_meta))); // 15 + x_tail = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 if ( c3n == u3ud(x_bloq) || - c3n == u3ud(y_bloq) || c3n == u3ud(x_kind) || - c3n == u3ud(y_kind) || - c3n == u3r_sing(x_shape, y_shape) || - c3n == u3r_sing(x_bloq, y_bloq) || - c3n == u3r_sing(x_kind, y_kind) - // fxp does not need to match here so no check + c3n == u3ud(rnd) ) { return u3m_bail(c3__exit); @@ -2417,9 +2362,9 @@ switch (x_kind) { case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_mod_i754(x_data, y_data, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_mod_i754(x_data, y_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); default: return u3_none; @@ -2442,12 +2387,12 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp, + u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 
- x_fxp = u3t(u3t(u3t(x_meta))); // 15 + x_tail = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 if ( c3n == u3ud(x_bloq) || c3n == u3ud(x_kind) || @@ -2459,9 +2404,9 @@ switch (x_kind) { case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_cumsum_i754(x_data, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_cumsum_i754(x_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - return u3nc(u3nq(u3nc(0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + return u3nc(u3nq(u3nc(0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); default: return u3_none; @@ -2484,11 +2429,10 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp; + u3_noun x_shape, x_bloq, x_kind; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 if ( c3n == u3ud(x_bloq) || c3n == u3ud(x_kind) || c3n == _check(u3nc(x_meta, x_data)) @@ -2498,7 +2442,7 @@ } else { switch (x_kind) { case c3__i754: { - u3_noun r_data = _soft_run(u3qi_la_argmin_i754(x_data, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_argmin_i754(x_data, x_shape, x_bloq); // bare atom (@ index) return r_data;} @@ -2523,7 +2467,7 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp; + u3_noun x_shape, x_bloq, x_kind; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 @@ -2536,7 +2480,7 @@ } else { switch (x_kind) { case c3__i754: { - u3_noun r_data = _soft_run(u3qi_la_ravel_i754(x_data, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_ravel_i754(x_data, x_shape, x_bloq); // (list @) return r_data;} @@ -2561,11 +2505,10 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp; + u3_noun x_shape, x_bloq, x_kind; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 if ( c3n == 
u3ud(x_bloq) || c3n == u3ud(x_kind) || c3n == _check(u3nc(x_meta, x_data)) @@ -2575,7 +2518,7 @@ } else { switch (x_kind) { case c3__i754: { - u3_noun r_data = _soft_run(u3qi_la_argmax_i754(x_data, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_argmax_i754(x_data, x_shape, x_bloq); // bare atom (@ index) return r_data;} @@ -2600,11 +2543,11 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp; + u3_noun x_shape, x_bloq, x_kind, x_tail; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 + x_tail = u3t(u3t(u3t(x_meta))); // 15 if ( c3n == u3ud(x_bloq) || c3n == u3ud(x_kind) || c3n == _check(u3nc(x_meta, x_data)) @@ -2614,9 +2557,9 @@ } else { switch (x_kind) { case c3__i754: { - u3_noun r_data = _soft_run(u3qi_la_min_i754(x_data, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_min_i754(x_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} + return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);} default: return u3_none; @@ -2639,11 +2582,11 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp; + u3_noun x_shape, x_bloq, x_kind, x_tail; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 + x_tail = u3t(u3t(u3t(x_meta))); // 15 if ( c3n == u3ud(x_bloq) || c3n == u3ud(x_kind) || c3n == _check(u3nc(x_meta, x_data)) @@ -2653,9 +2596,9 @@ } else { switch (x_kind) { case c3__i754: { - u3_noun r_data = _soft_run(u3qi_la_max_i754(x_data, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_max_i754(x_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} + return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), 
u3k(x_kind), u3k(x_tail)), r_data);} default: return u3_none; @@ -2678,11 +2621,11 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp; + u3_noun x_shape, x_bloq, x_kind, x_tail; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 + x_tail = u3t(u3t(u3t(x_meta))); // 15 if ( c3n == u3ud(x_bloq) || c3n == u3ud(x_kind) ) @@ -2691,9 +2634,9 @@ } else { switch (x_kind) { case c3__i754: { - u3_noun r_data = _soft_run(u3qi_la_abs_i754(x_data, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_abs_i754(x_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);} default: return u3_none; @@ -2715,39 +2658,27 @@ u3x_sam_6, &y_meta, u3x_sam_7, &y_data, 0) || + c3n == u3r_sing(x_meta, y_meta) || c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp, - y_shape, y_bloq, y_kind, y_fxp, - rnd; + u3_noun x_shape, x_bloq, x_kind; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 - y_shape = u3h(y_meta); // 2 - y_bloq = u3h(u3t(y_meta)); // 6 - y_kind = u3h(u3t(u3t(y_meta))); // 14 - y_fxp = u3t(u3t(u3t(y_meta))); // 15 if ( c3n == u3ud(x_bloq) || - c3n == u3ud(y_bloq) || - c3n == u3ud(x_kind) || - c3n == u3ud(y_kind) || - c3n == u3r_sing(x_shape, y_shape) || - c3n == u3r_sing(x_bloq, y_bloq) || - c3n == u3r_sing(x_kind, y_kind) || - c3n == u3r_sing(x_fxp, y_fxp) + c3n == u3ud(x_kind) ) { return u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: { - u3_noun r_data = _soft_run(u3qi_la_gth_i754(x_data, y_data, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_gth_i754(x_data, y_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - 
return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} + return u3nc(u3k(x_meta), r_data);} default: return u3_none; @@ -2769,39 +2700,27 @@ u3x_sam_6, &y_meta, u3x_sam_7, &y_data, 0) || + c3n == u3r_sing(x_meta, y_meta) || c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp, - y_shape, y_bloq, y_kind, y_fxp, - rnd; + u3_noun x_shape, x_bloq, x_kind; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 - y_shape = u3h(y_meta); // 2 - y_bloq = u3h(u3t(y_meta)); // 6 - y_kind = u3h(u3t(u3t(y_meta))); // 14 - y_fxp = u3t(u3t(u3t(y_meta))); // 15 if ( c3n == u3ud(x_bloq) || - c3n == u3ud(y_bloq) || - c3n == u3ud(x_kind) || - c3n == u3ud(y_kind) || - c3n == u3r_sing(x_shape, y_shape) || - c3n == u3r_sing(x_bloq, y_bloq) || - c3n == u3r_sing(x_kind, y_kind) || - c3n == u3r_sing(x_fxp, y_fxp) + c3n == u3ud(x_kind) ) { return u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: { - u3_noun r_data = _soft_run(u3qi_la_gte_i754(x_data, y_data, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_gte_i754(x_data, y_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} + return u3nc(u3k(x_meta), r_data);} default: return u3_none; @@ -2823,39 +2742,27 @@ u3x_sam_6, &y_meta, u3x_sam_7, &y_data, 0) || + c3n == u3r_sing(x_meta, y_meta) || c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp, - y_shape, y_bloq, y_kind, y_fxp, - rnd; + u3_noun x_shape, x_bloq, x_kind; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 - y_shape = u3h(y_meta); // 2 - y_bloq = u3h(u3t(y_meta)); // 6 - y_kind = u3h(u3t(u3t(y_meta))); // 14 - y_fxp = u3t(u3t(u3t(y_meta))); // 
15 if ( c3n == u3ud(x_bloq) || - c3n == u3ud(y_bloq) || - c3n == u3ud(x_kind) || - c3n == u3ud(y_kind) || - c3n == u3r_sing(x_shape, y_shape) || - c3n == u3r_sing(x_bloq, y_bloq) || - c3n == u3r_sing(x_kind, y_kind) || - c3n == u3r_sing(x_fxp, y_fxp) + c3n == u3ud(x_kind) ) { return u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: { - u3_noun r_data = _soft_run(u3qi_la_lth_i754(x_data, y_data, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_lth_i754(x_data, y_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} + return u3nc(u3k(x_meta), r_data);} default: return u3_none; @@ -2877,39 +2784,27 @@ u3x_sam_6, &y_meta, u3x_sam_7, &y_data, 0) || + c3n == u3r_sing(x_meta, y_meta) || c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp, - y_shape, y_bloq, y_kind, y_fxp, - rnd; + u3_noun x_shape, x_bloq, x_kind; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 - y_shape = u3h(y_meta); // 2 - y_bloq = u3h(u3t(y_meta)); // 6 - y_kind = u3h(u3t(u3t(y_meta))); // 14 - y_fxp = u3t(u3t(u3t(y_meta))); // 15 if ( c3n == u3ud(x_bloq) || - c3n == u3ud(y_bloq) || - c3n == u3ud(x_kind) || - c3n == u3ud(y_kind) || - c3n == u3r_sing(x_shape, y_shape) || - c3n == u3r_sing(x_bloq, y_bloq) || - c3n == u3r_sing(x_kind, y_kind) || - c3n == u3r_sing(x_fxp, y_fxp) + c3n == u3ud(x_kind) ) { return u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: { - u3_noun r_data = _soft_run(u3qi_la_lte_i754(x_data, y_data, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_lte_i754(x_data, y_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} + return u3nc(u3k(x_meta), r_data);} default: return u3_none; @@ -2934,19 +2829,19 @@ { return 
u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp, + u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 + x_tail = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 switch (x_kind) { case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_adds_i754(x_data, n, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_adds_i754(x_data, n, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); default: return u3_none; @@ -2970,19 +2865,19 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp, + u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 + x_tail = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 switch (x_kind) { case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_subs_i754(x_data, n, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_subs_i754(x_data, n, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); default: return u3_none; @@ -3006,19 +2901,19 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp, + u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 + x_tail = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 switch (x_kind) { case c3__i754: _set_rounding(rnd); - u3_noun r_data = 
_soft_run(u3qi_la_muls_i754(x_data, n, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_muls_i754(x_data, n, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); default: return u3_none; @@ -3042,19 +2937,19 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp, + u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 + x_tail = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 switch (x_kind) { case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_divs_i754(x_data, n, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_divs_i754(x_data, n, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); default: return u3_none; @@ -3078,19 +2973,19 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp, + u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 + x_tail = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 switch (x_kind) { case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_mods_i754(x_data, n, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_mods_i754(x_data, n, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); default: return u3_none; @@ -3111,31 +3006,21 @@ u3x_sam_6, &y_meta, u3x_sam_7, 
&y_data, 0) || + c3n == u3r_sing(x_meta, y_meta) || c3n == u3ud(x_data) || c3n == u3ud(y_data) ) { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp, - y_shape, y_bloq, y_kind, y_fxp, + u3_noun x_shape, x_bloq, x_kind, x_tail, rnd; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 - y_shape = u3h(y_meta); // 2 - y_bloq = u3h(u3t(y_meta)); // 6 - y_kind = u3h(u3t(u3t(y_meta))); // 14 - y_fxp = u3t(u3t(u3t(y_meta))); // 15 + x_tail = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 if ( c3n == u3ud(x_bloq) || - c3n == u3ud(y_bloq) || - c3n == u3ud(x_kind) || - c3n == u3ud(y_kind) || - c3n == u3r_sing(x_shape, y_shape) || - c3n == u3r_sing(x_bloq, y_bloq) || - c3n == u3r_sing(x_kind, y_kind) || - c3n == u3r_sing(x_fxp, y_fxp) + c3n == u3ud(x_kind) ) { return u3m_bail(c3__exit); @@ -3143,10 +3028,10 @@ switch (x_kind) { case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_dot_i754(x_data, y_data, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_dot_i754(x_data, y_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } c3_d len_x0 = _get_dims(x_shape)[0]; - return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); default: return u3_none; @@ -3169,11 +3054,11 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp; + u3_noun x_shape, x_bloq, x_kind, x_tail; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 + x_tail = u3t(u3t(u3t(x_meta))); // 15 if ( c3n == u3ud(x_bloq) || c3n == u3ud(x_kind) || c3n == _check(cor) @@ -3181,9 +3066,9 @@ { return u3m_bail(c3__exit); } else { - u3_noun r_data = _soft_run(u3qi_la_transpose(x_data, x_shape, x_bloq)); + u3_noun r_data = 
u3qi_la_transpose(x_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - return u3nc(u3nq(u3nt(u3k(u3h(x_shape)), u3k(u3h(u3t(x_shape))), u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + return u3nc(u3nq(u3nt(u3k(u3h(x_shape)), u3k(u3h(u3t(x_shape))), u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); } } } @@ -3202,11 +3087,11 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp; + u3_noun x_shape, x_bloq, x_kind, x_tail; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 + x_tail = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 if ( c3n == u3ud(x_bloq) || c3n == u3ud(x_kind) || @@ -3219,10 +3104,10 @@ switch (x_kind) { case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_linspace_i754(a, b, n, x_bloq)); + u3_noun r_data = u3qi_la_linspace_i754(a, b, n, x_bloq); if (r_data == u3_none) { return u3_none; } x_shape = u3nc(u3x_atom(n), u3_nul); - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); default: return u3_none; @@ -3245,11 +3130,11 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp; + u3_noun x_shape, x_bloq, x_kind, x_tail; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 + x_tail = u3t(u3t(u3t(x_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 if ( c3n == u3ud(x_bloq) || c3n == u3ud(x_kind) @@ -3260,7 +3145,7 @@ switch (x_kind) { case c3__i754: _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_range_i754(a, b, d, x_bloq)); + u3_noun r_data = u3qi_la_range_i754(a, b, d, x_bloq); if (r_data == u3_none) { return u3_none; } c3_d a_, b_, d_; c3_ds n_; @@ -3289,14 +3174,14 @@ u3r_bytes(0, 16, (c3_y*)&b__, b); u3r_bytes(0, 16, (c3_y*)&d__, d); 
float128_t tmp; - f128M_sub((float128_t*){&b__}, (float128_t*){&a__}, &tmp); - f128M_div(&tmp, (float128_t*){&d__}, &tmp); + f128M_sub((float128_t*)&b__, (float128_t*)&a__, &tmp); + f128M_div(&tmp, (float128_t*)&d__, &tmp); n_ = f128M_to_i64(&tmp, softfloat_round_minMag, false); break;} } u3_noun n = u3i_chub(n_+1); x_shape = u3nt(u3k(n), 0x1, u3_nul); - return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); default: return u3_none; @@ -3319,11 +3204,11 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp; + u3_noun x_shape, x_bloq, x_kind, x_tail; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 + x_tail = u3t(u3t(u3t(x_meta))); // 15 if ( c3n == u3ud(x_bloq) || c3n == u3ud(x_kind) || c3n == _check(cor) @@ -3331,10 +3216,10 @@ { return u3m_bail(c3__exit); } else { - u3_noun r_data = _soft_run(u3qi_la_diag(x_data, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_diag(x_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } c3_d len_x0 = _get_dims(x_shape)[0]; - return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data); + return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); } } } @@ -3353,12 +3238,12 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp; + u3_noun x_shape, x_bloq, x_kind, x_tail; if ( c3n == u3r_mean(x_meta, 2, &x_shape, 6, &x_bloq, 14, &x_kind, - 15, &x_fxp, + 15, &x_tail, 0) ) { @@ -3366,9 +3251,9 @@ } else { switch (x_kind) { case c3__i754: { - u3_noun r_data = _soft_run(u3qi_la_trace_i754(x_data, x_shape, x_bloq)); + u3_noun r_data = u3qi_la_trace_i754(x_data, x_shape, x_bloq); if (r_data == u3_none) { return u3_none; } - return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_fxp)), r_data);} + return 
u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);} default: return u3_none; @@ -3383,7 +3268,7 @@ // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, y_meta, y_data; - + fprintf(stderr, "mmul 1\n"); if ( c3n == u3r_mean(cor, u3x_sam_4, &x_meta, u3x_sam_5, &x_data, @@ -3395,31 +3280,26 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_fxp, - y_shape, y_bloq, y_kind, y_fxp, + u3_noun x_shape, x_bloq, x_kind, x_tail, + y_shape, rnd; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_fxp = u3t(u3t(u3t(x_meta))); // 15 + x_tail = u3t(u3t(u3t(x_meta))); // 15 y_shape = u3h(y_meta); // 2 - y_bloq = u3h(u3t(y_meta)); // 6 - y_kind = u3h(u3t(u3t(y_meta))); // 14 - y_fxp = u3t(u3t(u3t(y_meta))); // 15 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 - if ( c3n == u3r_sing(x_bloq, y_bloq) || - c3n == u3r_sing(x_kind, y_kind) || - c3n == _check(u3nc(x_meta, x_data)) || + if ( c3n == _check(u3nc(x_meta, x_data)) || c3n == _check(u3nc(y_meta, y_data)) - // fxp does not need to match so no check ) { return u3m_bail(c3__exit); } else { switch (x_kind) { case c3__i754: + fprintf(stderr, "mmul 2\n"); _set_rounding(rnd); - u3_noun r_data = _soft_run(u3qi_la_mmul_i754(x_data, y_data, x_shape, y_shape, x_bloq)); + u3_noun r_data = u3qi_la_mmul_i754(x_data, y_data, x_shape, y_shape, x_bloq); // result is already [meta data] return r_data; From 354852befb6be54dd77182f8617b376727c3542e Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Fri, 2 Aug 2024 19:37:57 -0400 Subject: [PATCH 78/97] http: weaker spur, chunk edge case, cleanup --- pkg/vere/io/http.c | 52 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index 5bf8fcc169..834843effd 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -658,8 +658,19 @@ _chunk_align(byte_range* rng_u) c3_z siz_z = 4194304; // 4MiB if 
( SIZE_MAX != rng_u->beg_z ) { - rng_u->beg_z = (rng_u->beg_z / siz_z) * siz_z; - rng_u->end_z = (rng_u->beg_z + siz_z) - 1; + if ( rng_u->beg_z > rng_u->end_z ) { + rng_u->beg_z = SIZE_MAX; + rng_u->end_z = SIZE_MAX; + } + else { + // XX an out-of-bounds request could be aligned to in-bounds + // resulting in a 200 or 206 response instead of 416. + // browsers should have the total length from content-range, + // and send reasonable range requests. + // + rng_u->beg_z = (rng_u->beg_z / siz_z) * siz_z; + rng_u->end_z = (rng_u->beg_z + siz_z) - 1; + } } else if ( SIZE_MAX != rng_u->end_z ) { // round up to multiple of siz_z @@ -682,7 +693,8 @@ _parse_range(c3_c* txt_c, c3_w len_w) rng_u.end_z = h2o_strtosize(hep_c + 1, len_w - ((hep_c + 1) - txt_c)); // strange -> [SIZE_MAX SIZE_MAX] if ( ((SIZE_MAX == rng_u.beg_z) && (hep_c != txt_c)) - || ((SIZE_MAX == rng_u.end_z) && (len_w - ((hep_c + 1) - txt_c) > 0)) ) + || ((SIZE_MAX == rng_u.end_z) && (len_w - ((hep_c + 1) - txt_c) > 0)) + || ((SIZE_MAX != rng_u.beg_z) && (rng_u.beg_z > rng_u.end_z)) ) { rng_u.beg_z = SIZE_MAX; rng_u.end_z = SIZE_MAX; @@ -691,6 +703,8 @@ _parse_range(c3_c* txt_c, c3_w len_w) return rng_u; } +/* _get_range(): get a _byte_range from headers +*/ static c3_o _get_range(h2o_headers_t req_headers, byte_range* rng_u) { @@ -714,7 +728,7 @@ _get_range(h2o_headers_t req_headers, byte_range* rng_u) return c3y; } -/* _http_scry_cb() +/* _http_scry_cb(): respond and maybe cache scry result */ static void _http_scry_cb(void* vod_p, u3_noun nun) @@ -747,13 +761,17 @@ _http_scry_cb(void* vod_p, u3_noun nun) c3_free(peq_u); } +/* _beam: ship desk case spur +*/ typedef struct _beam { - u3_noun who; - u3_noun des; - u3_noun cas; + u3_weak who; + u3_weak des; + u3_weak cas; u3_weak pur; } beam; +/* _free_beam(): free a beam +*/ static void _free_beam(beam* bem) { @@ -763,14 +781,14 @@ _free_beam(beam* bem) u3z(bem->pur); } -/* _get_beam: path to beam +/* _get_beam(): get a _beam from url */ static beam 
_get_beam(u3_hreq* req_u, c3_c* txt_c, c3_w len_w) { beam bem; - // get beak from path + // get beak // for ( c3_w i_w = 0; i_w < 3; ++i_w ) { u3_noun* wer; @@ -798,6 +816,7 @@ _get_beam(u3_hreq* req_u, c3_c* txt_c, c3_w len_w) txt_c++; len_w--; } + // '=' if ( (len_w > 0) && ('=' == txt_c[0]) ) { if ( 0 == i_w ) { @@ -828,8 +847,8 @@ _get_beam(u3_hreq* req_u, c3_c* txt_c, c3_w len_w) } if ( !nex_c ) { - // XX bad beam - *wer = u3_nul; + *wer = u3_none; + return bem; } else { c3_w dif_w = (c3_p)(nex_c - txt_c); @@ -840,6 +859,7 @@ _get_beam(u3_hreq* req_u, c3_c* txt_c, c3_w len_w) } } + // get spur u3_noun tmp = u3dc("rush", u3i_bytes(len_w, (const c3_y*)txt_c), u3v_wish("stap")); bem.pur = ( u3_nul == tmp ) ? u3_none : u3k(u3t(tmp)); u3z(tmp); @@ -867,7 +887,7 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) || (0 != memcmp("/_~_/", bas_c, 5)) ) { // no: inject to arvo - u3_noun wir = _http_req_to_duct(req_u); + u3_noun wir = _http_req_to_duct(req_u); u3_noun cad; u3_noun adr = u3nc(c3__ipv4, u3i_words(1, &req_u->hon_u->ipf_w)); // XX loopback automatically secure too? 
@@ -904,7 +924,11 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) } beam bem = _get_beam(req_u, bas_c, len_w); - if ( u3_none == bem.pur ) { + if ( (u3_none == bem.who) + || (u3_none == bem.des) + || (u3_none == bem.cas) + || (u3_none == bem.pur) ) + { c3_c* msg_c = "bad request"; h2o_send_error_generic(req_u->rec_u, 400, msg_c, msg_c, 0); u3z(gang); @@ -1012,6 +1036,8 @@ _http_cache_respond(u3_hreq* req_u, u3_noun nun) u3z(nun); } +/* _http_scry_respond(): respond with a simple-payload:http +*/ static void _http_scry_respond(u3_hreq* req_u, u3_noun nun) { From a0fd92197b29d62a2c20d834d212d2379a35cafe Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Mon, 5 Aug 2024 12:30:48 -0400 Subject: [PATCH 79/97] http: fix possible bail foul --- pkg/vere/io/http.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index 834843effd..0584930e36 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -909,6 +909,7 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) req_u->peq_u->htd_u = htd_u; req_u->peq_u->las_o = c3n; req_u->sat_e = u3_rsat_peek; + req_u->peq_u->pax = u3_nul; u3_hfig* fig_u = &req_u->hon_u->htp_u->htd_u->fig_u; h2o_req_t* rec_u = req_u->rec_u; @@ -932,6 +933,7 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) c3_c* msg_c = "bad request"; h2o_send_error_generic(req_u->rec_u, 400, msg_c, msg_c, 0); u3z(gang); + u3z(req_u->peq_u->pax); _free_beam(&bem); return; } @@ -974,6 +976,7 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) || ((u3_nul == gang) && (c3y == u3r_at(14, nac))) ) { // maybe cache, then serve subsequent range requests from cache + u3z(req_u->peq_u->pax); req_u->peq_u->pax = u3k(bam); u3_pier_peek(htd_u->car_u.pir_u, gang, u3nt(0, c3__ex, bam), req_u->peq_u, _http_scry_cb); @@ -985,7 +988,7 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) u3z(gang); } } - _free_beam(&bem); + _free_beam(&bem); } } } From bee8af3c7adc0b431d9f8fd9a7990421d806b6fc Mon Sep 17 00:00:00 2001 From: 
midden-fabler Date: Tue, 6 Aug 2024 10:53:58 -0400 Subject: [PATCH 80/97] http: fix bail meme --- pkg/vere/io/http.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index 0584930e36..9f5b937511 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -1021,7 +1021,7 @@ _http_cache_respond(u3_hreq* req_u, u3_noun nun) } else { u3_noun auth, response_header, data; - u3x_qual(u3k(u3t(u3t(nun))), &auth, 0, &response_header, &data); + u3x_qual(u3t(u3t(nun)), &auth, 0, &response_header, &data); u3_noun status, headers; u3x_cell(response_header, &status, &headers); @@ -1066,7 +1066,7 @@ _http_scry_respond(u3_hreq* req_u, u3_noun nun) } else { u3_noun auth, response_header, data; - u3x_qual(u3k(u3t(u3t(nun))), &auth, 0, &response_header, &data); + u3x_qual(u3t(u3t(nun)), &auth, 0, &response_header, &data); u3_noun status, headers; u3x_cell(response_header, &status, &headers); From 88f92eeddbcb3f217ca7162baa4a03d2d5e8e63d Mon Sep 17 00:00:00 2001 From: midden-fabler Date: Tue, 6 Aug 2024 23:21:56 -0400 Subject: [PATCH 81/97] http: check ship for scry at now --- pkg/vere/io/http.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pkg/vere/io/http.c b/pkg/vere/io/http.c index 9f5b937511..094c813af6 100644 --- a/pkg/vere/io/http.c +++ b/pkg/vere/io/http.c @@ -963,8 +963,19 @@ _http_req_dispatch(u3_hreq* req_u, u3_noun req) // peek or respond from cache // if ( c3y == req_u->peq_u->las_o ) { - u3_pier_peek_last(htd_u->car_u.pir_u, gang, c3__ex, - u3k(bem.des), spur, req_u->peq_u, _http_scry_cb); + u3_noun our = u3dc("scot", 'p', u3i_chubs(2, htd_u->car_u.pir_u->who_d)); + if ( our == bem.who ) { + u3_pier_peek_last(htd_u->car_u.pir_u, gang, c3__ex, + u3k(bem.des), spur, req_u->peq_u, _http_scry_cb); + } + else { + c3_c* msg_c = "bad request"; + h2o_send_error_generic(req_u->rec_u, 400, msg_c, msg_c, 0); + u3z(gang); + u3z(spur); + u3z(req_u->peq_u->pax); + } + u3z(our); } else { 
u3_noun bam = u3nq(u3k(bem.who), u3k(bem.des), u3k(bem.cas), spur); From 3d462d3fa5354ef210493a9943a6b2e7b8ab8bc8 Mon Sep 17 00:00:00 2001 From: Sigilante Date: Tue, 13 Aug 2024 12:03:51 -0500 Subject: [PATCH 82/97] Add corrected range jets. --- WORKSPACE.bazel | 2 +- pkg/noun/jets/i/lagoon.c | 85 +++++++++++++++++++++------------------- 2 files changed, 45 insertions(+), 42 deletions(-) diff --git a/WORKSPACE.bazel b/WORKSPACE.bazel index ec6a637114..37d6659125 100644 --- a/WORKSPACE.bazel +++ b/WORKSPACE.bazel @@ -297,7 +297,7 @@ versioned_http_archive( strip_prefix = "SoftBLAS-{version}", # sha256 = "", url = "https://github.com/urbit/SoftBLAS/archive/{version}.tar.gz", - version = "29daa2f2fd0ad5070e405ad287f3623804f8fc67", + version = "cbffb33f19ea02f9ffbd184d445123c57929ec53", ) versioned_http_archive( diff --git a/pkg/noun/jets/i/lagoon.c b/pkg/noun/jets/i/lagoon.c index 5da333ecf0..13c0f2a138 100644 --- a/pkg/noun/jets/i/lagoon.c +++ b/pkg/noun/jets/i/lagoon.c @@ -10,6 +10,11 @@ #include // for pow() #include +#define f16_ceil(a) f16_roundToInt( a, softfloat_round_max, false ) +#define f32_ceil(a) f32_roundToInt( a, softfloat_round_max, false ) +#define f64_ceil(a) f64_roundToInt( a, softfloat_round_max, false ) +#define f128M_ceil(a, b) f128M_roundToInt( a, softfloat_round_max, false, b ) + union half { float16_t h; c3_w c; @@ -1964,15 +1969,14 @@ u3r_bytes(0, 2, (c3_y*)&(a16.v), a); u3r_bytes(0, 2, (c3_y*)&(b16.v), b); u3r_bytes(0, 2, (c3_y*)&(interval16.v), d); - c3_d n16 = f16_to_i64(f16_div(f16_sub(b16, a16), interval16), softfloat_round_minMag, false); - c3_y* x_bytes16 = (c3_y*)u3a_malloc(((n16+1)*2+1)*sizeof(c3_y)); - for (c3_d i = 1; i <= n16; i++) { - ((float16_t*)x_bytes16)[n16-i] = f16_add(a16, f16_mul(i32_to_f16(i), interval16)); - } - ((float16_t*)x_bytes16)[n16] = a16; - // ((float16_t*)x_bytes16)[0] = b16; - x_bytes16[(n16+1)*2] = 0x1; // pin head - r_data = u3i_bytes(((n16+1)*2+1)*sizeof(c3_y), x_bytes16); + c3_d n16 = 
f16_to_i64(f16_ceil(f16_div(f16_sub(b16, a16), interval16)), softfloat_round_minMag, false); + c3_y* x_bytes16 = (c3_y*)u3a_malloc(((n16+1)*2)*sizeof(c3_y)); + ((float16_t*)x_bytes16)[0] = a16; + for (c3_d i = 1; i < n16; i++) { + ((float16_t*)x_bytes16)[i] = f16_add(a16, f16_mul(i32_to_f16(i), interval16)); + } + ((float16_t*)x_bytes16)[n16].v = 0x1; // pin head + r_data = u3i_bytes(((n16+1)*2)*sizeof(c3_y), x_bytes16); u3a_free(x_bytes16); break;} @@ -1981,15 +1985,14 @@ u3r_bytes(0, 4, (c3_y*)&(a32.v), a); u3r_bytes(0, 4, (c3_y*)&(b32.v), b); u3r_bytes(0, 4, (c3_y*)&(interval32.v), d); - c3_d n32 = f32_to_i64(f32_div(f32_sub(b32, a32), interval32), softfloat_round_minMag, false); - c3_y* x_bytes32 = (c3_y*)u3a_malloc(((n32+1)*4+1)*sizeof(c3_y)); - for (c3_d i = 1; i <= n32; i++) { - ((float32_t*)x_bytes32)[n32-i] = f32_add(a32, f32_mul(i32_to_f32(i), interval32)); - } - ((float32_t*)x_bytes32)[n32] = a32; - // ((float32_t*)x_bytes32)[0] = b32; - x_bytes32[(n32+1)*4] = 0x1; // pin head - r_data = u3i_bytes(((n32+1)*4+1)*sizeof(c3_y), x_bytes32); + c3_d n32 = f32_to_i64(f32_ceil(f32_div(f32_sub(b32, a32), interval32)), softfloat_round_minMag, false); + c3_y* x_bytes32 = (c3_y*)u3a_malloc(((n32+1)*4)*sizeof(c3_y)); + ((float32_t*)x_bytes32)[0] = a32; + for (c3_d i = 1; i < n32; i++) { + ((float32_t*)x_bytes32)[i] = f32_add(a32, f32_mul(i32_to_f32(i), interval32)); + } + ((float32_t*)x_bytes32)[n32].v = 0x1; // pin head + r_data = u3i_bytes(((n32+1)*4)*sizeof(c3_y), x_bytes32); u3a_free(x_bytes32); break;} @@ -1998,15 +2001,14 @@ u3r_bytes(0, 8, (c3_y*)&(a64.v), a); u3r_bytes(0, 8, (c3_y*)&(b64.v), b); u3r_bytes(0, 8, (c3_y*)&(interval64.v), d); - c3_d n64 = f64_to_i64(f64_div(f64_sub(b64, a64), interval64), softfloat_round_minMag, false); - c3_y* x_bytes64 = (c3_y*)u3a_malloc(((n64+1)*8+1)*sizeof(c3_y)); + c3_d n64 = f64_to_i64(f64_ceil(f64_div(f64_sub(b64, a64), interval64)), softfloat_round_minMag, false); + c3_y* x_bytes64 = 
(c3_y*)u3a_malloc(((n64+1)*8)*sizeof(c3_y)); + ((float64_t*)x_bytes64)[0] = a64; for (c3_d i = 1; i < n64; i++) { - ((float64_t*)x_bytes64)[n64-i] = f64_add(a64, f64_mul(i32_to_f64(i), interval64)); + ((float64_t*)x_bytes64)[i] = f64_add(a64, f64_mul(i32_to_f64(i), interval64)); } - ((float64_t*)x_bytes64)[n64] = a64; - // ((float64_t*)x_bytes64)[0] = b64; - x_bytes64[(n64+1)*8] = 0x1; // pin head - r_data = u3i_bytes(((n64+1)*8+1)*sizeof(c3_y), x_bytes64); + ((float64_t*)x_bytes64)[n64].v = 0x1; // pin head + r_data = u3i_bytes(((n64+1)*8)*sizeof(c3_y), x_bytes64); u3a_free(x_bytes64); break;} @@ -2014,22 +2016,23 @@ float128_t a128, b128, interval128; u3r_bytes(0, 16, (c3_y*)&(a128.v[0]), a); u3r_bytes(0, 16, (c3_y*)&(b128.v[0]), b); - u3r_bytes(0, 16, (c3_y*)&(interval128.v), d); + u3r_bytes(0, 16, (c3_y*)&(interval128.v[0]), d); float128_t tmp; f128M_sub(&b128, &a128, &tmp); - f128M_div(&tmp, &interval128, &interval128); + f128M_div(&tmp, &interval128, &tmp); + f128M_ceil(&tmp, &tmp); c3_d n128 = f128M_to_i64(&tmp, softfloat_round_minMag, false); - c3_y* x_bytes128 = (c3_y*)u3a_malloc(((n128+1)*16+1)*sizeof(c3_y)); + c3_y* x_bytes128 = (c3_y*)u3a_malloc(((n128+1)*16)*sizeof(c3_y)); float128_t i128; + ((float128_t*)x_bytes128)[0] = a128; for (c3_d i = 1; i < n128; i++) { i32_to_f128M(i, &i128); - f128M_mul(&i128, &interval128, &((float128_t*)x_bytes128)[n128-i]); - f128M_add(&a128, &((float128_t*)x_bytes128)[n128-i], &((float128_t*)x_bytes128)[n128-i]); + f128M_mul(&i128, &interval128, &((float128_t*)x_bytes128)[i]); + f128M_add(&a128, &((float128_t*)x_bytes128)[i], &((float128_t*)x_bytes128)[i]); } - ((float128_t*)x_bytes128)[n128] = a128; - // ((float128_t*)x_bytes128)[0] = b128; - x_bytes128[(n128+1)*16] = 0x1; // pin head - r_data = u3i_bytes(((n128+1)*16+1)*sizeof(c3_y), x_bytes128); + ((float128_t*)x_bytes128)[n128].v[0] = 0x1; // pin head + ((float128_t*)x_bytes128)[n128].v[1] = 0x0; // pin head + r_data = u3i_bytes(((n128+1)*16)*sizeof(c3_y), 
x_bytes128); u3a_free(x_bytes128); break;} } @@ -3154,19 +3157,19 @@ u3r_bytes(0, 2, (c3_y*)&a_, a); u3r_bytes(0, 2, (c3_y*)&b_, b); u3r_bytes(0, 2, (c3_y*)&d_, d); - n_ = f16_to_i64(f16_div(f16_sub((float16_t){b_}, (float16_t){a_}), (float16_t){d_}), softfloat_round_minMag, false); + n_ = f16_to_i64(f16_ceil(f16_div(f16_sub((float16_t){b_}, (float16_t){a_}), (float16_t){d_})), softfloat_round_minMag, false) - 1; break; case 5: u3r_bytes(0, 4, (c3_y*)&a_, a); u3r_bytes(0, 4, (c3_y*)&b_, b); u3r_bytes(0, 4, (c3_y*)&d_, d); - n_ = f32_to_i64(f32_div(f32_sub((float32_t){b_}, (float32_t){a_}), (float32_t){d_}), softfloat_round_minMag, false); + n_ = f32_to_i64(f32_ceil(f32_div(f32_sub((float32_t){b_}, (float32_t){a_}), (float32_t){d_})), softfloat_round_minMag, false) - 1; break; case 6: u3r_bytes(0, 8, (c3_y*)&a_, a); u3r_bytes(0, 8, (c3_y*)&b_, b); u3r_bytes(0, 8, (c3_y*)&d_, d); - n_ = f64_to_i64(f64_div(f64_sub((float64_t){b_}, (float64_t){a_}), (float64_t){d_}), softfloat_round_minMag, false); + n_ = f64_to_i64(f64_ceil(f64_div(f64_sub((float64_t){b_}, (float64_t){a_}), (float64_t){d_})), softfloat_round_minMag, false) - 1; break; case 7: { c3_d a__[2], b__[2], d__[2]; @@ -3176,11 +3179,12 @@ float128_t tmp; f128M_sub((float128_t*)&b__, (float128_t*)&a__, &tmp); f128M_div(&tmp, (float128_t*)&d__, &tmp); - n_ = f128M_to_i64(&tmp, softfloat_round_minMag, false); + f128M_ceil(&tmp, &tmp); + n_ = f128M_to_i64(&tmp, softfloat_round_minMag, false) - 1; break;} } u3_noun n = u3i_chub(n_+1); - x_shape = u3nt(u3k(n), 0x1, u3_nul); + x_shape = u3nc(u3k(n), u3_nul); return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); default: @@ -3268,7 +3272,7 @@ // Each argument is a ray, [=meta data=@ux] u3_noun x_meta, x_data, y_meta, y_data; - fprintf(stderr, "mmul 1\n"); + if ( c3n == u3r_mean(cor, u3x_sam_4, &x_meta, u3x_sam_5, &x_data, @@ -3297,7 +3301,6 @@ } else { switch (x_kind) { case c3__i754: - fprintf(stderr, "mmul 2\n"); _set_rounding(rnd); u3_noun 
r_data = u3qi_la_mmul_i754(x_data, y_data, x_shape, y_shape, x_bloq); // result is already [meta data] From d1291e67d9ab71e3d6b78a98afd772f94f5ed6e7 Mon Sep 17 00:00:00 2001 From: pkova Date: Wed, 14 Aug 2024 19:57:17 +0300 Subject: [PATCH 83/97] manage: u3m_mark takes no arguments --- pkg/noun/manage.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/noun/manage.c b/pkg/noun/manage.c index 71a1c86542..1ff748d528 100644 --- a/pkg/noun/manage.c +++ b/pkg/noun/manage.c @@ -460,7 +460,7 @@ u3m_file(c3_c* pas_c) /* u3m_mark(): mark all nouns in the road. */ u3m_quac** -u3m_mark() +u3m_mark(void) { u3m_quac** qua_u = c3_malloc(sizeof(*qua_u) * 5); qua_u[0] = u3v_mark(); @@ -1368,7 +1368,7 @@ u3m_grab(u3_noun som, ...) // terminate with u3_none // u3h_free(u3R->cax.har_p); // u3R->cax.har_p = u3h_new(); - u3m_mark(0); + u3m_mark(); { va_list vap; u3_noun tur; From cccfcd27512e0204eb220ade7fd14e5fcc48f007 Mon Sep 17 00:00:00 2001 From: pkova Date: Wed, 14 Aug 2024 19:57:37 +0300 Subject: [PATCH 84/97] allocate: mark the trace stack in u3a_mark_road --- pkg/noun/allocate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/noun/allocate.c b/pkg/noun/allocate.c index 60332b5b52..829356a96a 100644 --- a/pkg/noun/allocate.c +++ b/pkg/noun/allocate.c @@ -2191,7 +2191,7 @@ u3a_mark_road() qua_u[1] = c3_calloc(sizeof(*qua_u[1])); qua_u[1]->nam_c = strdup("trace stack"); - qua_u[1]->siz_w = u3a_mark_noun(u3R->ski.gul) * 4; + qua_u[1]->siz_w = u3a_mark_noun(u3R->bug.tax) * 4; qua_u[2] = c3_calloc(sizeof(*qua_u[2])); qua_u[2]->nam_c = strdup("trace buffer"); From 526287c2bd4d9cde7b4973cba2dc61c257510194 Mon Sep 17 00:00:00 2001 From: Pyry Kovanen Date: Wed, 9 Oct 2024 16:51:20 +0300 Subject: [PATCH 85/97] build: change mirror for libnatpmp, hopefully less flaky --- ext/natpmp/build.zig.zon | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/natpmp/build.zig.zon b/ext/natpmp/build.zig.zon index 9f1c3f12a9..76d822b995 
100644 --- a/ext/natpmp/build.zig.zon +++ b/ext/natpmp/build.zig.zon @@ -3,7 +3,7 @@ .version = "0.0.1", .dependencies = .{ .natpmp = .{ - .url = "http://www.miniupnp.tuxfamily.org/files/libnatpmp-20230423.tar.gz", + .url = "https://debian.mirror.root.lu/debian/pool/main/libn/libnatpmp/libnatpmp_20230423.orig.tar.gz", .hash = "12203f777796f1df1db24c4194bcc6060d2a7bee2eea88527c2336bbf455d4108239", }, }, From 43c95d627d7b0248d093d5c3bd679d9078b5d245 Mon Sep 17 00:00:00 2001 From: Santeri Hannula Date: Thu, 10 Oct 2024 17:47:00 +0300 Subject: [PATCH 86/97] ext: fix generated gmp x86_64 linux assemby sources --- ext/gmp/build.zig | 2 +- ext/gmp/gen/x86_64-linux/mpn/add_err1_n.s | 14 +- ext/gmp/gen/x86_64-linux/mpn/add_n.s | 207 +-- ext/gmp/gen/x86_64-linux/mpn/addlsh1_n.s | 179 +-- ext/gmp/gen/x86_64-linux/mpn/addlsh2_n.s | 194 ++- ext/gmp/gen/x86_64-linux/mpn/addlsh_n.s | 300 ++-- ext/gmp/gen/x86_64-linux/mpn/addmul_1.s | 218 ++- ext/gmp/gen/x86_64-linux/mpn/addmul_2.s | 244 ++- ext/gmp/gen/x86_64-linux/mpn/and_n.s | 63 +- ext/gmp/gen/x86_64-linux/mpn/andn_n.s | 61 +- ext/gmp/gen/x86_64-linux/mpn/bdiv_q_1.s | 77 +- ext/gmp/gen/x86_64-linux/mpn/cnd_add_n.s | 160 +- ext/gmp/gen/x86_64-linux/mpn/cnd_sub_n.s | 165 +- ext/gmp/gen/x86_64-linux/mpn/com.s | 303 +--- ext/gmp/gen/x86_64-linux/mpn/copyd.s | 213 +-- ext/gmp/gen/x86_64-linux/mpn/copyi.s | 249 +-- ext/gmp/gen/x86_64-linux/mpn/div_qr_1n_pi1.s | 6 +- ext/gmp/gen/x86_64-linux/mpn/divrem_1.s | 15 - ext/gmp/gen/x86_64-linux/mpn/gcd_11.s | 164 +- ext/gmp/gen/x86_64-linux/mpn/gcd_22.s | 319 +++- ext/gmp/gen/x86_64-linux/mpn/hamdist.s | 204 +-- ext/gmp/gen/x86_64-linux/mpn/ior_n.s | 63 +- ext/gmp/gen/x86_64-linux/mpn/iorn_n.s | 61 +- ext/gmp/gen/x86_64-linux/mpn/lshift.s | 237 ++- ext/gmp/gen/x86_64-linux/mpn/lshiftc.s | 259 ++-- ext/gmp/gen/x86_64-linux/mpn/mul_1.s | 219 +-- ext/gmp/gen/x86_64-linux/mpn/mul_2.s | 190 ++- ext/gmp/gen/x86_64-linux/mpn/mul_basecase.s | 625 ++++---- 
ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s | 641 ++++---- .../gen/x86_64-linux/mpn/mulmid_basecase.s | 573 +++++++ ext/gmp/gen/x86_64-linux/mpn/nand_n.s | 63 +- ext/gmp/gen/x86_64-linux/mpn/nior_n.s | 63 +- ext/gmp/gen/x86_64-linux/mpn/popcount.s | 189 +-- ext/gmp/gen/x86_64-linux/mpn/redc_1.s | 792 ++++++---- ext/gmp/gen/x86_64-linux/mpn/rsblsh1_n.s | 179 +-- ext/gmp/gen/x86_64-linux/mpn/rsblsh2_n.s | 194 ++- ext/gmp/gen/x86_64-linux/mpn/rsblsh_n.s | 300 ++-- ext/gmp/gen/x86_64-linux/mpn/rsh1add_n.s | 83 +- ext/gmp/gen/x86_64-linux/mpn/rsh1sub_n.s | 83 +- ext/gmp/gen/x86_64-linux/mpn/rshift.s | 251 ++- ext/gmp/gen/x86_64-linux/mpn/sec_tabselect.s | 197 +-- ext/gmp/gen/x86_64-linux/mpn/sqr_basecase.s | 1372 ++++++++--------- ext/gmp/gen/x86_64-linux/mpn/sub_err1_n.s | 14 +- ext/gmp/gen/x86_64-linux/mpn/sub_n.s | 207 +-- ext/gmp/gen/x86_64-linux/mpn/sublsh1_n.s | 171 +- ext/gmp/gen/x86_64-linux/mpn/sublsh2_n.s | 190 --- ext/gmp/gen/x86_64-linux/mpn/submul_1.s | 203 ++- ext/gmp/gen/x86_64-linux/mpn/xnor_n.s | 61 +- ext/gmp/gen/x86_64-linux/mpn/xor_n.s | 63 +- 49 files changed, 5334 insertions(+), 5566 deletions(-) create mode 100644 ext/gmp/gen/x86_64-linux/mpn/mulmid_basecase.s delete mode 100644 ext/gmp/gen/x86_64-linux/mpn/sublsh2_n.s diff --git a/ext/gmp/build.zig b/ext/gmp/build.zig index efa83a9895..cdfad41394 100644 --- a/ext/gmp/build.zig +++ b/ext/gmp/build.zig @@ -685,6 +685,7 @@ const x86_64_linux_asm_sources = [_][]const u8{ "gen/x86_64-linux/mpn/mul_2.s", "gen/x86_64-linux/mpn/mul_basecase.s", "gen/x86_64-linux/mpn/mullo_basecase.s", + "gen/x86_64-linux/mpn/mulmid_basecase.s", "gen/x86_64-linux/mpn/nand_n.s", "gen/x86_64-linux/mpn/nior_n.s", "gen/x86_64-linux/mpn/popcount.s", @@ -703,7 +704,6 @@ const x86_64_linux_asm_sources = [_][]const u8{ "gen/x86_64-linux/mpn/sub_err3_n.s", "gen/x86_64-linux/mpn/sub_n.s", "gen/x86_64-linux/mpn/sublsh1_n.s", - "gen/x86_64-linux/mpn/sublsh2_n.s", "gen/x86_64-linux/mpn/submul_1.s", "gen/x86_64-linux/mpn/xnor_n.s", 
"gen/x86_64-linux/mpn/xor_n.s", diff --git a/ext/gmp/gen/x86_64-linux/mpn/add_err1_n.s b/ext/gmp/gen/x86_64-linux/mpn/add_err1_n.s index 6c2ae338b4..2cbba6ad10 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/add_err1_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/add_err1_n.s @@ -189,20 +189,20 @@ __gmpn_add_err1_n: .align 32, 0x90 .Lloop: - mov (%rsi,%r9,8), %r14 shr $1, %al mov -8(%r8), %r10 mov $0, %r13d + mov (%rsi,%r9,8), %r14 + mov 8(%rsi,%r9,8), %r15 adc (%rdx,%r9,8), %r14 cmovnc %r13, %r10 - mov %r14, (%rdi,%r9,8) - mov 8(%rsi,%r9,8), %r15 - mov 16(%rsi,%r9,8), %r14 adc 8(%rdx,%r9,8), %r15 mov -16(%r8), %r11 + mov %r14, (%rdi,%r9,8) + mov 16(%rsi,%r9,8), %r14 + mov %r15, 8(%rdi,%r9,8) cmovnc %r13, %r11 mov -24(%r8), %r12 - mov %r15, 8(%rdi,%r9,8) adc 16(%rdx,%r9,8), %r14 cmovnc %r13, %r12 mov 24(%rsi,%r9,8), %r15 @@ -215,12 +215,12 @@ __gmpn_add_err1_n: adc $0, %rbp add %r12, %rbx adc $0, %rbp - lea -32(%r8), %r8 mov %r14, 16(%rdi,%r9,8) add %r13, %rbx + lea -32(%r8), %r8 adc $0, %rbp + mov %r15, 24(%rdi,%r9,8) add $4, %r9 - mov %r15, -8(%rdi,%r9,8) jnz .Lloop .Lend: diff --git a/ext/gmp/gen/x86_64-linux/mpn/add_n.s b/ext/gmp/gen/x86_64-linux/mpn/add_n.s index 400fe976ec..14cc32b0b9 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/add_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/add_n.s @@ -94,20 +94,18 @@ __gmpn_add_nc: - mov %ecx, %eax - shr $3, %rcx - and $7, %eax - - lea .Ltab(%rip), %r9 - neg %r8 + shr $2, %rcx + and $3, %eax + bt $0, %r8 + jrcxz .Llt4 - movslq (%r9,%rax,4), %rax - lea (%r9,%rax), %rax - jmp *%rax + mov (%rsi), %r8 + mov 8(%rsi), %r9 + dec %rcx + jmp .Lmid .size __gmpn_add_nc,.-__gmpn_add_nc - .align 16, 0x90 .globl __gmpn_add_n .type __gmpn_add_n,@function @@ -115,159 +113,82 @@ __gmpn_add_nc: __gmpn_add_n: - mov %ecx, %eax - shr $3, %rcx - and $7, %eax - - lea .Ltab(%rip), %r9 + shr $2, %rcx + and $3, %eax + jrcxz .Llt4 - movslq (%r9,%rax,4), %rax - lea (%r9,%rax), %rax - jmp *%rax - - -.L0: mov (%rsi), %r8 + mov (%rsi), %r8 mov 8(%rsi), %r9 + dec %rcx + jmp 
.Lmid + +.Llt4: dec %eax + mov (%rsi), %r8 + jnz .L2 adc (%rdx), %r8 - jmp .Le0 + mov %r8, (%rdi) + adc %eax, %eax + + ret -.L4: mov (%rsi), %r8 +.L2: dec %eax mov 8(%rsi), %r9 + jnz .L3 adc (%rdx), %r8 - lea -32(%rsi), %rsi - lea -32(%rdx), %rdx - lea -32(%rdi), %rdi - inc %rcx - jmp .Le4 - -.L5: mov (%rsi), %r11 - mov 8(%rsi), %r8 - mov 16(%rsi), %r9 - adc (%rdx), %r11 - lea -24(%rsi), %rsi - lea -24(%rdx), %rdx - lea -24(%rdi), %rdi - inc %rcx - jmp .Le5 - -.L6: mov (%rsi), %r10 - adc (%rdx), %r10 - mov 8(%rsi), %r11 - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi - inc %rcx - jmp .Le6 - -.L7: mov (%rsi), %r9 - mov 8(%rsi), %r10 - adc (%rdx), %r9 - adc 8(%rdx), %r10 - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi - inc %rcx - jmp .Le7 + adc 8(%rdx), %r9 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + adc %eax, %eax + + ret - .align 16, 0x90 -.Ltop: -.Le3: mov %r9, 40(%rdi) -.Le2: mov %r10, 48(%rdi) -.Le1: mov (%rsi), %r8 - mov 8(%rsi), %r9 +.L3: mov 16(%rsi), %r10 adc (%rdx), %r8 - mov %r11, 56(%rdi) - lea 64(%rdi), %rdi -.Le0: mov 16(%rsi), %r10 adc 8(%rdx), %r9 adc 16(%rdx), %r10 mov %r8, (%rdi) -.Le7: mov 24(%rsi), %r11 mov %r9, 8(%rdi) -.Le6: mov 32(%rsi), %r8 - mov 40(%rsi), %r9 - adc 24(%rdx), %r11 mov %r10, 16(%rdi) -.Le5: adc 32(%rdx), %r8 - mov %r11, 24(%rdi) -.Le4: mov 48(%rsi), %r10 - mov 56(%rsi), %r11 - mov %r8, 32(%rdi) - lea 64(%rsi), %rsi - adc 40(%rdx), %r9 - adc 48(%rdx), %r10 - adc 56(%rdx), %r11 - lea 64(%rdx), %rdx - dec %rcx - jnz .Ltop - -.Lend: mov %r9, 40(%rdi) - mov %r10, 48(%rdi) - mov %r11, 56(%rdi) - mov %ecx, %eax - adc %ecx, %eax + setc %al ret .align 16, 0x90 -.L3: mov (%rsi), %r9 - mov 8(%rsi), %r10 - mov 16(%rsi), %r11 - adc (%rdx), %r9 - adc 8(%rdx), %r10 - adc 16(%rdx), %r11 - jrcxz .Lx3 - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea -40(%rdi), %rdi - jmp .Le3 -.Lx3: mov %r9, (%rdi) - mov %r10, 8(%rdi) - mov %r11, 16(%rdi) - mov %ecx, %eax - adc %ecx, %eax - - ret +.Ltop: adc (%rdx), %r8 + adc 8(%rdx), 
%r9 + adc 16(%rdx), %r10 + adc 24(%rdx), %r11 + mov %r8, (%rdi) + lea 32(%rsi), %rsi + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + dec %rcx + mov %r11, 24(%rdi) + lea 32(%rdx), %rdx + mov (%rsi), %r8 + mov 8(%rsi), %r9 + lea 32(%rdi), %rdi +.Lmid: mov 16(%rsi), %r10 + mov 24(%rsi), %r11 + jnz .Ltop - .align 16, 0x90 -.L1: mov (%rsi), %r11 - adc (%rdx), %r11 - jrcxz .Lx1 - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea -56(%rdi), %rdi - jmp .Le1 -.Lx1: mov %r11, (%rdi) - mov %ecx, %eax - adc %ecx, %eax - - ret +.Lend: lea 32(%rsi), %rsi + adc (%rdx), %r8 + adc 8(%rdx), %r9 + adc 16(%rdx), %r10 + adc 24(%rdx), %r11 + lea 32(%rdx), %rdx + mov %r8, (%rdi) + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + mov %r11, 24(%rdi) + lea 32(%rdi), %rdi - .align 16, 0x90 -.L2: mov (%rsi), %r10 - mov 8(%rsi), %r11 - adc (%rdx), %r10 - adc 8(%rdx), %r11 - jrcxz .Lx2 - lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea -48(%rdi), %rdi - jmp .Le2 -.Lx2: mov %r10, (%rdi) - mov %r11, 8(%rdi) - mov %ecx, %eax - adc %ecx, %eax + inc %eax + dec %eax + jnz .Llt4 + adc %eax, %eax ret .size __gmpn_add_n,.-__gmpn_add_n - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .L0-.Ltab - .long .L1-.Ltab - .long .L2-.Ltab - .long .L3-.Ltab - .long .L4-.Ltab - .long .L5-.Ltab - .long .L6-.Ltab - .long .L7-.Ltab diff --git a/ext/gmp/gen/x86_64-linux/mpn/addlsh1_n.s b/ext/gmp/gen/x86_64-linux/mpn/addlsh1_n.s index cac8dd4b70..e3d3aae6c0 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/addlsh1_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/addlsh1_n.s @@ -46,15 +46,6 @@ - - - - - - - - - @@ -77,6 +68,9 @@ + + + @@ -86,29 +80,6 @@ .text - .align 16, 0x90 - .globl __gmpn_addlsh1_nc - .type __gmpn_addlsh1_nc,@function - -__gmpn_addlsh1_nc: - - - - push %rbp - mov %r8, %rax - neg %rax - xor %ebp, %ebp - mov (%rdx), %r8 - shrd $63, %r8, %rbp - mov %ecx, %r9d - and $3, %r9d - je .Lb00 - cmp $2, %r9d - jc .Lb01 - je .Lb10 - jmp .Lb11 - .size __gmpn_addlsh1_nc,.-__gmpn_addlsh1_nc - .align 16, 0x90 .globl __gmpn_addlsh1_n 
.type __gmpn_addlsh1_n,@function @@ -117,96 +88,92 @@ __gmpn_addlsh1_n: push %rbp - xor %ebp, %ebp + mov (%rdx), %r8 - shrd $63, %r8, %rbp mov %ecx, %eax + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rdx + neg %rcx + xor %ebp, %ebp and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: mov 8(%rdx), %r9 - shrd $63, %r9, %r8 - mov 16(%rdx), %r10 - shrd $63, %r10, %r9 - add %eax, %eax - adc (%rsi), %rbp - adc 8(%rsi), %r8 - adc 16(%rsi), %r9 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, %rbp - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea 24(%rdi), %rdi +.Lb11: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 sbb %eax, %eax - sub $3, %rcx - ja .Ltop - jmp .Lend - -.Lb01: add %eax, %eax - adc (%rsi), %rbp - mov %rbp, (%rdi) - mov %r8, %rbp - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi + add (%rsi,%rcx,8), %r8 + adc 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + adc 16(%rsi,%rcx,8), %r10 + mov %r10, 16(%rdi,%rcx,8) + sbb %ebp, %ebp + add $3, %rcx + jmp .Lent + +.Lb10: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 sbb %eax, %eax - sub $1, %rcx - ja .Ltop - jmp .Lend - -.Lb10: mov 8(%rdx), %r9 - shrd $63, %r9, %r8 - add %eax, %eax - adc (%rsi), %rbp - adc 8(%rsi), %r8 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, %rbp - lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea 16(%rdi), %rdi + add (%rsi,%rcx,8), %r8 + adc 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + sbb %ebp, %ebp + add $2, %rcx + jmp .Lent + +.Lb01: add %r8, %r8 sbb %eax, %eax - sub $2, %rcx - ja .Ltop - jmp .Lend + add (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + sbb %ebp, %ebp + inc %rcx +.Lent: jns .Lend .align 16, 0x90 -.Ltop: mov (%rdx), %r8 - shrd $63, %r8, %rbp -.Lb00: mov 8(%rdx), %r9 - shrd $63, %r9, %r8 - mov 16(%rdx), %r10 - shrd $63, %r10, %r9 - mov 24(%rdx), %r11 - shrd $63, %r11, %r10 - lea 32(%rdx), %rdx - add %eax, 
%eax - adc (%rsi), %rbp - adc 8(%rsi), %r8 - adc 16(%rsi), %r9 - adc 24(%rsi), %r10 - lea 32(%rsi), %rsi - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, 24(%rdi) - mov %r11, %rbp - lea 32(%rdi), %rdi +.Ltop: add %eax, %eax + + mov (%rdx,%rcx,8), %r8 +.Lb00: adc %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 + mov 24(%rdx,%rcx,8), %r11 + adc %r11, %r11 + sbb %eax, %eax - sub $4, %rcx - jnz .Ltop + add %ebp, %ebp + + adc (%rsi,%rcx,8), %r8 + nop + adc 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + adc 16(%rsi,%rcx,8), %r10 + adc 24(%rsi,%rcx,8), %r11 + mov %r10, 16(%rdi,%rcx,8) + mov %r11, 24(%rdi,%rcx,8) + + sbb %ebp, %ebp + add $4, %rcx + js .Ltop + +.Lend: + + add %ebp, %eax + neg %eax + -.Lend: shr $63, %rbp - add %eax, %eax - adc $0, %rbp - mov %rbp, %rax pop %rbp ret .size __gmpn_addlsh1_n,.-__gmpn_addlsh1_n - diff --git a/ext/gmp/gen/x86_64-linux/mpn/addlsh2_n.s b/ext/gmp/gen/x86_64-linux/mpn/addlsh2_n.s index 313daa83e2..00e20905cc 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/addlsh2_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/addlsh2_n.s @@ -46,10 +46,13 @@ - - - - + + + + + + + @@ -87,30 +90,11 @@ - .text - .align 16, 0x90 - .globl __gmpn_addlsh2_nc - .type __gmpn_addlsh2_nc,@function - -__gmpn_addlsh2_nc: - - push %rbp - mov %r8, %rax - neg %rax - xor %ebp, %ebp - mov (%rdx), %r8 - shrd $62, %r8, %rbp - mov %ecx, %r9d - and $3, %r9d - je .Lb00 - cmp $2, %r9d - jc .Lb01 - je .Lb10 - jmp .Lb11 - .size __gmpn_addlsh2_nc,.-__gmpn_addlsh2_nc + + .text .align 16, 0x90 .globl __gmpn_addlsh2_n .type __gmpn_addlsh2_n,@function @@ -118,96 +102,102 @@ __gmpn_addlsh2_nc: __gmpn_addlsh2_n: - push %rbp - xor %ebp, %ebp + push %r12 + push %r13 + push %r14 + push %r15 + mov (%rdx), %r8 - shrd $62, %r8, %rbp + lea (,%r8,4), %r12 + shr $62, %r8 + mov %ecx, %eax - and $3, %eax + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rdx + neg %rcx + and $3, %al je .Lb00 - 
cmp $2, %eax + cmp $2, %al jc .Lb01 je .Lb10 -.Lb11: mov 8(%rdx), %r9 - shrd $62, %r9, %r8 - mov 16(%rdx), %r10 - shrd $62, %r10, %r9 - add %eax, %eax - adc (%rsi), %rbp - adc 8(%rsi), %r8 - adc 16(%rsi), %r9 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, %rbp - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea 24(%rdi), %rdi - sbb %eax, %eax - sub $3, %rcx - ja .Ltop +.Lb11: mov 8(%rdx,%rcx,8), %r10 + lea (%r8,%r10,4), %r14 + shr $62, %r10 + mov 16(%rdx,%rcx,8), %r11 + lea (%r10,%r11,4), %r15 + shr $62, %r11 + add (%rsi,%rcx,8), %r12 + adc 8(%rsi,%rcx,8), %r14 + adc 16(%rsi,%rcx,8), %r15 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + mov %r14, 8(%rdi,%rcx,8) + mov %r15, 16(%rdi,%rcx,8) + add $3, %rcx + js .Ltop jmp .Lend -.Lb01: add %eax, %eax - adc (%rsi), %rbp - mov %rbp, (%rdi) - mov %r8, %rbp - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - sbb %eax, %eax - sub $1, %rcx - ja .Ltop +.Lb01: mov %r8, %r11 + add (%rsi,%rcx,8), %r12 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + add $1, %rcx + js .Ltop jmp .Lend -.Lb10: mov 8(%rdx), %r9 - shrd $62, %r9, %r8 - add %eax, %eax - adc (%rsi), %rbp - adc 8(%rsi), %r8 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, %rbp - lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea 16(%rdi), %rdi - sbb %eax, %eax - sub $2, %rcx - ja .Ltop +.Lb10: mov 8(%rdx,%rcx,8), %r11 + lea (%r8,%r11,4), %r15 + shr $62, %r11 + add (%rsi,%rcx,8), %r12 + adc 8(%rsi,%rcx,8), %r15 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + mov %r15, 8(%rdi,%rcx,8) + add $2, %rcx + js .Ltop jmp .Lend +.Lb00: mov 8(%rdx,%rcx,8), %r9 + mov 16(%rdx,%rcx,8), %r10 + jmp .Le00 + .align 16, 0x90 -.Ltop: mov (%rdx), %r8 - shrd $62, %r8, %rbp -.Lb00: mov 8(%rdx), %r9 - shrd $62, %r9, %r8 - mov 16(%rdx), %r10 - shrd $62, %r10, %r9 - mov 24(%rdx), %r11 - shrd $62, %r11, %r10 - lea 32(%rdx), %rdx - add %eax, %eax - adc (%rsi), %rbp - adc 8(%rsi), %r8 - adc 16(%rsi), %r9 - adc 24(%rsi), %r10 - lea 32(%rsi), %rsi - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - 
mov %r9, 16(%rdi) - mov %r10, 24(%rdi) - mov %r11, %rbp - lea 32(%rdi), %rdi - sbb %eax, %eax - sub $4, %rcx - jnz .Ltop - -.Lend: shr $62, %rbp - add %eax, %eax - adc $0, %rbp - mov %rbp, %rax - pop %rbp +.Ltop: mov 16(%rdx,%rcx,8), %r10 + mov (%rdx,%rcx,8), %r8 + mov 8(%rdx,%rcx,8), %r9 + lea (%r11,%r8,4), %r12 + shr $62, %r8 +.Le00: lea (%r8,%r9,4), %r13 + shr $62, %r9 + mov 24(%rdx,%rcx,8), %r11 + lea (%r9,%r10,4), %r14 + shr $62, %r10 + lea (%r10,%r11,4), %r15 + shr $62, %r11 + add %eax, %eax + adc (%rsi,%rcx,8), %r12 + adc 8(%rsi,%rcx,8), %r13 + adc 16(%rsi,%rcx,8), %r14 + adc 24(%rsi,%rcx,8), %r15 + mov %r12, (%rdi,%rcx,8) + mov %r13, 8(%rdi,%rcx,8) + mov %r14, 16(%rdi,%rcx,8) + sbb %eax, %eax + mov %r15, 24(%rdi,%rcx,8) + add $4, %rcx + js .Ltop +.Lend: + + + sub %r11d, %eax + neg %eax + + pop %r15 + pop %r14 + pop %r13 + pop %r12 ret .size __gmpn_addlsh2_n,.-__gmpn_addlsh2_n diff --git a/ext/gmp/gen/x86_64-linux/mpn/addlsh_n.s b/ext/gmp/gen/x86_64-linux/mpn/addlsh_n.s index 00e16c8d00..2d261d5e37 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/addlsh_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/addlsh_n.s @@ -65,32 +65,7 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - + @@ -103,7 +78,7 @@ .text - .align 32, 0x90 + .align 16, 0x90 .globl __gmpn_addlsh_n .type __gmpn_addlsh_n,@function @@ -111,142 +86,143 @@ __gmpn_addlsh_n: + push %r12 + push %rbp + push %rbx + + mov (%rdx), %rax + + mov $0, %ebp + sub %rcx, %rbp + + lea -16(%rsi,%rcx,8), %rsi + lea -16(%rdi,%rcx,8), %rdi + lea 16(%rdx,%rcx,8), %r12 + + mov %rcx, %r9 + + mov %r8, %rcx + mov $1, %r8d + shl %cl, %r8 + + mul %r8 + + and $3, %r9d + jz .Lb0 + cmp $2, %r9d + jc .Lb1 + jz .Lb2 + +.Lb3: mov %rax, %r11 + add 16(%rsi,%rbp,8), %r11 + mov -8(%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov (%r12,%rbp,8), %rax + mov %rdx, %r9 + mul %r8 + or %rax, %r9 + add $3, %rbp + jnz .Llo3 + jmp .Lcj3 + +.Lb2: mov %rax, %rbx + mov -8(%r12,%rbp,8), %rax + mov %rdx, %r9 + mul %r8 + or 
%rax, %r9 + add $2, %rbp + jz .Lcj2 + mov %rdx, %r10 + mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + xor %ecx, %ecx + jmp .Llo2 + +.Lb1: mov %rax, %r9 + mov %rdx, %r10 + add $1, %rbp + jnz .Lgt1 + add 8(%rsi,%rbp,8), %r9 + jmp .Lcj1 +.Lgt1: mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + add 8(%rsi,%rbp,8), %r9 + adc 16(%rsi,%rbp,8), %r10 + adc 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + jmp .Llo1 + +.Lb0: mov %rax, %r10 + mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + add 16(%rsi,%rbp,8), %r10 + adc 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov 8(%r12,%rbp,8), %rax + add $4, %rbp + jz .Lend - mov (%rdx), %r10 - - mov %ecx, %eax - shr $3, %rcx - xor %r9d, %r9d - sub %r8, %r9 - and $7, %eax - - lea .Ltab(%rip), %r11 - - movslq (%r11,%rax,4), %rax - add %r11, %rax - jmp *%rax - - -.L0: lea 32(%rsi), %rsi - lea 32(%rdx), %rdx - lea 32(%rdi), %rdi - xor %r11d, %r11d - jmp .Le0 - -.L7: mov %r10, %r11 - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea 24(%rdi), %rdi - xor %r10d, %r10d - jmp .Le7 - -.L6: lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea 16(%rdi), %rdi - xor %r11d, %r11d - jmp .Le6 - -.L5: mov %r10, %r11 - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - xor %r10d, %r10d - jmp .Le5 - -.Lend: adc 24(%rsi), %rax - mov %rax, -40(%rdi) - .byte 0xc4,194,179,0xf7,195 - adc %rcx, %rax + .align 8, 0x90 +.Ltop: mov %rdx, %r9 + mul %r8 + or %rax, %r9 + mov %r10, -16(%rdi,%rbp,8) +.Llo3: mov %rdx, %r10 + mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + mov %r11, -8(%rdi,%rbp,8) +.Llo2: mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + add %ecx, %ecx + adc (%rsi,%rbp,8), %rbx + adc 8(%rsi,%rbp,8), %r9 + adc 16(%rsi,%rbp,8), %r10 + adc 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rbx, (%rdi,%rbp,8) +.Llo1: mov 
%rdx, %rbx + mul %r8 + or %rax, %rbx + mov %r9, 8(%rdi,%rbp,8) +.Llo0: mov 8(%r12,%rbp,8), %rax + add $4, %rbp + jnz .Ltop + +.Lend: mov %rdx, %r9 + mul %r8 + or %rax, %r9 + mov %r10, -16(%rdi,%rbp,8) +.Lcj3: mov %r11, -8(%rdi,%rbp,8) +.Lcj2: add %ecx, %ecx + adc (%rsi,%rbp,8), %rbx + adc 8(%rsi,%rbp,8), %r9 + mov %rbx, (%rdi,%rbp,8) +.Lcj1: mov %r9, 8(%rdi,%rbp,8) + mov %rdx, %rax + adc $0, %rax + pop %rbx + pop %rbp + pop %r12 ret - - .align 32, 0x90 -.Ltop: jrcxz .Lend - mov -32(%rdx), %r10 - adc 24(%rsi), %rax - lea 64(%rsi), %rsi - .byte 0xc4,66,179,0xf7,219 - mov %rax, -40(%rdi) -.Le0: dec %rcx - .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov -24(%rdx), %r11 - adc -32(%rsi), %rax - .byte 0xc4,66,179,0xf7,210 - mov %rax, -32(%rdi) -.Le7: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - mov -16(%rdx), %r10 - adc -24(%rsi), %rax - .byte 0xc4,66,179,0xf7,219 - mov %rax, -24(%rdi) -.Le6: .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov -8(%rdx), %r11 - adc -16(%rsi), %rax - .byte 0xc4,66,179,0xf7,210 - mov %rax, -16(%rdi) -.Le5: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - mov (%rdx), %r10 - adc -8(%rsi), %rax - .byte 0xc4,66,179,0xf7,219 - mov %rax, -8(%rdi) -.Le4: .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov 8(%rdx), %r11 - adc (%rsi), %rax - .byte 0xc4,66,179,0xf7,210 - mov %rax, (%rdi) -.Le3: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - mov 16(%rdx), %r10 - adc 8(%rsi), %rax - .byte 0xc4,66,179,0xf7,219 - mov %rax, 8(%rdi) -.Le2: .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov 24(%rdx), %r11 - adc 16(%rsi), %rax - lea 64(%rdx), %rdx - .byte 0xc4,66,179,0xf7,210 - mov %rax, 16(%rdi) - lea 64(%rdi), %rdi -.Le1: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - jmp .Ltop - -.L4: xor %r11d, %r11d - jmp .Le4 - -.L3: mov %r10, %r11 - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi - xor %r10d, %r10d - jmp .Le3 - -.L2: lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi - 
xor %r11d, %r11d - jmp .Le2 - -.L1: mov %r10, %r11 - lea -24(%rsi), %rsi - lea 40(%rdx), %rdx - lea 40(%rdi), %rdi - xor %r10d, %r10d - jmp .Le1 .size __gmpn_addlsh_n,.-__gmpn_addlsh_n - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .L0-.Ltab - .long .L1-.Ltab - .long .L2-.Ltab - .long .L3-.Ltab - .long .L4-.Ltab - .long .L5-.Ltab - .long .L6-.Ltab - .long .L7-.Ltab - diff --git a/ext/gmp/gen/x86_64-linux/mpn/addmul_1.s b/ext/gmp/gen/x86_64-linux/mpn/addmul_1.s index 715dc68504..8daf1ac3cd 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/addmul_1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/addmul_1.s @@ -67,8 +67,6 @@ - - @@ -77,136 +75,122 @@ + + + + + + + + .text - .align 32, 0x90 + .align 16, 0x90 .globl __gmpn_addmul_1 .type __gmpn_addmul_1,@function __gmpn_addmul_1: - - mov %rcx, %r10 - mov %rdx, %rcx - mov %edx, %r8d - shr $3, %rcx - and $7, %r8d - mov %r10, %rdx - lea .Ltab(%rip), %r10 - movslq (%r10,%r8,4), %r8 - lea (%r8, %r10), %r10 - jmp *%r10 - - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .Lf0-.Ltab - .long .Lf1-.Ltab - .long .Lf2-.Ltab - .long .Lf3-.Ltab - .long .Lf4-.Ltab - .long .Lf5-.Ltab - .long .Lf6-.Ltab - .long .Lf7-.Ltab - .text -.Lf0: .byte 0xc4,98,171,0xf6,6 - lea -8(%rsi), %rsi - lea -8(%rdi), %rdi - lea -1(%rcx), %rcx - jmp .Lb0 - -.Lf3: .byte 0xc4,226,179,0xf6,6 - lea 16(%rsi), %rsi - lea -48(%rdi), %rdi - jmp .Lb3 - -.Lf4: .byte 0xc4,98,171,0xf6,6 - lea 24(%rsi), %rsi - lea -40(%rdi), %rdi - jmp .Lb4 - -.Lf5: .byte 0xc4,226,179,0xf6,6 - lea 32(%rsi), %rsi - lea -32(%rdi), %rdi - jmp .Lb5 - -.Lf6: .byte 0xc4,98,171,0xf6,6 - lea 40(%rsi), %rsi - lea -24(%rdi), %rdi - jmp .Lb6 - -.Lf1: .byte 0xc4,226,179,0xf6,6 - jrcxz .L1 - jmp .Lb1 -.L1: add (%rdi), %r9 - mov %r9, (%rdi) - adc %rcx, %rax - - ret -.Lend: .byte 0xf3,76,0x0f,0x38,0xf6,15 - mov %r9, (%rdi) - .byte 0xf3,72,0x0f,0x38,0xf6,193 - adc %rcx, %rax - - ret - nop;nop;nop;nop - -.Lf2: .byte 0xc4,98,171,0xf6,6 - lea 8(%rsi), %rsi - lea 8(%rdi), 
%rdi - .byte 0xc4,226,179,0xf6,6 - - .align 32, 0x90 -.Ltop: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, -8(%rdi) - jrcxz .Lend -.Lb1: .byte 0xc4,98,171,0xf6,70,8 - .byte 0xf3,76,0x0f,0x38,0xf6,15 - lea -1(%rcx), %rcx - mov %r9, (%rdi) - .byte 0x66,76,0x0f,0x38,0xf6,208 -.Lb0: .byte 0xc4,226,179,0xf6,70,16 - .byte 0x66,77,0x0f,0x38,0xf6,200 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) -.Lb7: .byte 0xc4,98,171,0xf6,70,24 - lea 64(%rsi), %rsi - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xf3,76,0x0f,0x38,0xf6,79,16 - mov %r9, 16(%rdi) -.Lb6: .byte 0xc4,226,179,0xf6,70,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, 24(%rdi) -.Lb5: .byte 0xc4,98,171,0xf6,70,232 - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xf3,76,0x0f,0x38,0xf6,79,32 - mov %r9, 32(%rdi) -.Lb4: .byte 0xc4,226,179,0xf6,70,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, 40(%rdi) -.Lb3: .byte 0xf3,76,0x0f,0x38,0xf6,79,48 - .byte 0xc4,98,171,0xf6,70,248 - mov %r9, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xc4,226,179,0xf6,6 - jmp .Ltop - -.Lf7: .byte 0xc4,226,179,0xf6,6 - lea -16(%rsi), %rsi - lea -16(%rdi), %rdi - jmp .Lb7 - .size __gmpn_addmul_1,.-__gmpn_addmul_1 + mov (%rsi), %rax + push %rbx + mov %rdx, %rbx + + mul %rcx + mov %rbx, %r11 + + and $3, %ebx + jz .Lb0 + cmp $2, %ebx + jz .Lb2 + jg .Lb3 + +.Lb1: dec %r11 + jne .Lgt1 + add %rax, (%rdi) + jmp .Lret +.Lgt1: lea 8(%rsi,%r11,8), %rsi + lea -8(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + xor %ebx, %ebx + mov %rax, %r9 + mov (%rsi,%r11,8), %rax + mov %rdx, %r8 + jmp .LL1 + +.Lb0: lea (%rsi,%r11,8), %rsi + lea -16(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + mov %rax, %r8 + mov %rdx, %rbx + jmp .LL0 + +.Lb3: lea -8(%rsi,%r11,8), %rsi + lea -24(%rdi,%r11,8), %rdi + neg %r11 + mov %rax, %rbx + mov %rdx, %r10 + jmp .LL3 + +.Lb2: lea -16(%rsi,%r11,8), %rsi + lea -32(%rdi,%r11,8), %rdi + 
neg %r11 + xor %r8, %r8 + xor %ebx, %ebx + mov %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %rdx, %r9 + jmp .LL2 + + .align 16, 0x90 +.Ltop: add %r10, (%rdi,%r11,8) + adc %rax, %r9 + mov (%rsi,%r11,8), %rax + adc %rdx, %r8 + mov $0, %r10d +.LL1: mul %rcx + add %r9, 8(%rdi,%r11,8) + adc %rax, %r8 + adc %rdx, %rbx +.LL0: mov 8(%rsi,%r11,8), %rax + mul %rcx + add %r8, 16(%rdi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 +.LL3: mov 16(%rsi,%r11,8), %rax + mul %rcx + add %rbx, 24(%rdi,%r11,8) + mov $0, %r8d + mov %r8, %rbx + adc %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %r8, %r9 + adc %rdx, %r9 +.LL2: mul %rcx + add $4, %r11 + js .Ltop + + add %r10, (%rdi,%r11,8) + adc %rax, %r9 + adc %r8, %rdx + add %r9, 8(%rdi,%r11,8) +.Lret: adc $0, %rdx + mov %rdx, %rax + + pop %rbx + + + ret + .size __gmpn_addmul_1,.-__gmpn_addmul_1 diff --git a/ext/gmp/gen/x86_64-linux/mpn/addmul_2.s b/ext/gmp/gen/x86_64-linux/mpn/addmul_2.s index 7fd478bd41..5883dab926 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/addmul_2.s +++ b/ext/gmp/gen/x86_64-linux/mpn/addmul_2.s @@ -83,171 +83,125 @@ + .text - .align 32, 0x90 + .align 16, 0x90 .globl __gmpn_addmul_2 .type __gmpn_addmul_2,@function __gmpn_addmul_2: + mov %rdx, %r11 push %rbx push %rbp - push %r12 - push %r13 - mov (%rcx), %r8 + mov 0(%rcx), %r8 mov 8(%rcx), %r9 - mov %rdx, %r11 - shr $2, %r11 - - test $1, %dl - jnz .Lbx1 - -.Lbx0: mov (%rdi), %r12 - mov 8(%rdi), %r13 - test $2, %dl - jnz .Lb10 - -.Lb00: mov (%rsi), %rdx - lea 16(%rsi), %rsi - .byte 0xc4,194,251,0xf6,200 - add %rax, %r12 - .byte 0xc4,194,251,0xf6,233 - adc $0, %rcx - mov %r12, (%rdi) - add %rax, %r13 - adc $0, %rbp - mov -8(%rsi), %rdx - lea 16(%rdi), %rdi - jmp .Llo0 + mov %edx, %ebx + mov (%rsi), %rax + lea -8(%rsi,%rdx,8), %rsi + lea -8(%rdi,%rdx,8), %rdi + mul %r8 + neg %r11 + and $3, %ebx + jz .Lb0 + cmp $2, %ebx + jc .Lb1 + jz .Lb2 + +.Lb3: mov %rax, %rcx + mov %rdx, %rbp + xor %r10d, %r10d + mov 8(%rsi,%r11,8), %rax + dec %r11 + jmp .Llo3 -.Lb10: mov (%rsi), %rdx - inc 
%r11 - .byte 0xc4,194,251,0xf6,200 - add %rax, %r12 - adc $0, %rcx - .byte 0xc4,194,251,0xf6,233 - mov %r12, (%rdi) - mov 16(%rdi), %r12 - add %rax, %r13 - adc $0, %rbp - xor %rbx, %rbx +.Lb2: mov %rax, %rbp + mov 8(%rsi,%r11,8), %rax + mov %rdx, %r10 + xor %ebx, %ebx + add $-2, %r11 jmp .Llo2 -.Lbx1: mov (%rdi), %r13 - mov 8(%rdi), %r12 - test $2, %dl - jnz .Lb11 - -.Lb01: mov (%rsi), %rdx - .byte 0xc4,66,251,0xf6,208 - add %rax, %r13 - adc $0, %r10 - .byte 0xc4,194,251,0xf6,217 - add %rax, %r12 - adc $0, %rbx - mov 8(%rsi), %rdx - mov %r13, (%rdi) - mov 16(%rdi), %r13 - .byte 0xc4,194,251,0xf6,200 - lea 24(%rdi), %rdi - lea 24(%rsi), %rsi +.Lb1: mov %rax, %r10 + mov 8(%rsi,%r11,8), %rax + mov %rdx, %rbx + xor %ecx, %ecx + inc %r11 jmp .Llo1 -.Lb11: mov (%rsi), %rdx - inc %r11 - .byte 0xc4,66,251,0xf6,208 - add %rax, %r13 - adc $0, %r10 - .byte 0xc4,194,251,0xf6,217 - add %rax, %r12 - adc $0, %rbx - mov %r13, (%rdi) - mov 8(%rsi), %rdx - .byte 0xc4,194,251,0xf6,200 - lea 8(%rdi), %rdi - lea 8(%rsi), %rsi - jmp .Llo3 +.Lb0: mov $0, %r10d + mov %rax, %rbx + mov 8(%rsi,%r11,8), %rax + mov %rdx, %rcx + xor %ebp, %ebp + jmp .Llo0 + + .align 32, 0x90 +.Ltop: mov $0, %ecx + mul %r8 + add %rax, %r10 + mov (%rsi,%r11,8), %rax + adc %rdx, %rbx + adc $0, %ecx +.Llo1: mul %r9 + add %r10, (%rdi,%r11,8) + mov $0, %r10d + adc %rax, %rbx + mov $0, %ebp + mov 8(%rsi,%r11,8), %rax + adc %rdx, %rcx + mul %r8 + add %rax, %rbx + mov 8(%rsi,%r11,8), %rax + adc %rdx, %rcx + adc $0, %ebp +.Llo0: mul %r9 + add %rbx, 8(%rdi,%r11,8) + adc %rax, %rcx + adc %rdx, %rbp + mov 16(%rsi,%r11,8), %rax + mul %r8 + add %rax, %rcx + adc %rdx, %rbp + adc $0, %r10d + mov 16(%rsi,%r11,8), %rax +.Llo3: mul %r9 + add %rcx, 16(%rdi,%r11,8) + adc %rax, %rbp + adc %rdx, %r10 + xor %ebx, %ebx + mov 24(%rsi,%r11,8), %rax + mul %r8 + add %rax, %rbp + mov 24(%rsi,%r11,8), %rax + adc %rdx, %r10 + adc $0, %ebx +.Llo2: mul %r9 + add %rbp, 24(%rdi,%r11,8) + adc %rax, %r10 + adc %rdx, %rbx + mov 32(%rsi,%r11,8), %rax 
+ add $4, %r11 + js .Ltop + +.Lend: xor %ecx, %ecx + mul %r8 + add %rax, %r10 + mov (%rsi), %rax + adc %rdx, %rbx + adc %ecx, %ecx + mul %r9 + add %r10, (%rdi) + adc %rax, %rbx + adc %rdx, %rcx + mov %rbx, 8(%rdi) + mov %rcx, %rax - .align 16, 0x90 -.Ltop: .byte 0xc4,66,251,0xf6,208 - add %rbx, %r13 - adc $0, %rbp - add %rax, %r13 - adc $0, %r10 - .byte 0xc4,194,251,0xf6,217 - add %rax, %r12 - adc $0, %rbx - lea 32(%rdi), %rdi - add %rcx, %r13 - mov -16(%rsi), %rdx - mov %r13, -24(%rdi) - adc $0, %r10 - add %rbp, %r12 - mov -8(%rdi), %r13 - .byte 0xc4,194,251,0xf6,200 - adc $0, %rbx -.Llo1: add %rax, %r12 - .byte 0xc4,194,251,0xf6,233 - adc $0, %rcx - add %r10, %r12 - mov %r12, -16(%rdi) - adc $0, %rcx - add %rax, %r13 - adc $0, %rbp - add %rbx, %r13 - mov -8(%rsi), %rdx - adc $0, %rbp -.Llo0: .byte 0xc4,66,251,0xf6,208 - add %rax, %r13 - adc $0, %r10 - mov (%rdi), %r12 - .byte 0xc4,194,251,0xf6,217 - add %rax, %r12 - adc $0, %rbx - add %rcx, %r13 - mov %r13, -8(%rdi) - adc $0, %r10 - mov (%rsi), %rdx - add %rbp, %r12 - .byte 0xc4,194,251,0xf6,200 - adc $0, %rbx -.Llo3: add %rax, %r12 - adc $0, %rcx - .byte 0xc4,194,251,0xf6,233 - add %r10, %r12 - mov 8(%rdi), %r13 - mov %r12, (%rdi) - mov 16(%rdi), %r12 - adc $0, %rcx - add %rax, %r13 - adc $0, %rbp -.Llo2: mov 8(%rsi), %rdx - lea 32(%rsi), %rsi - dec %r11 - jnz .Ltop - -.Lend: .byte 0xc4,66,251,0xf6,208 - add %rbx, %r13 - adc $0, %rbp - add %rax, %r13 - adc $0, %r10 - .byte 0xc4,194,235,0xf6,193 - add %rcx, %r13 - mov %r13, 8(%rdi) - adc $0, %r10 - add %rbp, %rdx - adc $0, %rax - add %r10, %rdx - mov %rdx, 16(%rdi) - adc $0, %rax - - pop %r13 - pop %r12 pop %rbp pop %rbx diff --git a/ext/gmp/gen/x86_64-linux/mpn/and_n.s b/ext/gmp/gen/x86_64-linux/mpn/and_n.s index 0bdc08b1fb..946906ecf6 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/and_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/and_n.s @@ -90,7 +90,6 @@ - .text @@ -103,50 +102,42 @@ __gmpn_and_n: mov (%rdx), %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea 
(%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: and (%rsi), %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi +.Lb11: and (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: and (%rsi), %r8 - mov %r8, (%rdi) - dec %rcx +.Lb01: and (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 -.Lb00: mov 8(%rdx), %r9 - and (%rsi), %r8 - and 8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 -.Le10: mov 24(%rdx), %r9 - lea 32(%rdx), %rdx - and 16(%rsi), %r8 - and 24(%rsi), %r9 - lea 32(%rsi), %rsi - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + +.Ltop: mov (%rdx,%rcx,8), %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + and (%rsi,%rcx,8), %r8 + and 8(%rsi,%rcx,8), %r9 + nop + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + and 16(%rsi,%rcx,8), %r8 + and 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/andn_n.s b/ext/gmp/gen/x86_64-linux/mpn/andn_n.s index 73fe85c5fd..aee1df4efc 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/andn_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/andn_n.s @@ -92,7 +92,6 @@ - .text @@ -106,54 +105,46 @@ __gmpn_andn_n: mov (%rdx), %r8 not %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: and (%rsi), %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi +.Lb11: and (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec 
%rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: and (%rsi), %r8 - mov %r8, (%rdi) - dec %rcx + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: and (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 +.Ltop: mov (%rdx,%rcx,8), %r8 not %r8 -.Lb00: mov 8(%rdx), %r9 +.Lb00: mov 8(%rdx,%rcx,8), %r9 not %r9 - and (%rsi), %r8 - and 8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 + and (%rsi,%rcx,8), %r8 + and 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 not %r8 -.Le10: mov 24(%rdx), %r9 +.Le10: mov 24(%rdx,%rcx,8), %r9 not %r9 - lea 32(%rdx), %rdx - and 16(%rsi), %r8 - and 24(%rsi), %r9 - lea 32(%rsi), %rsi - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + and 16(%rsi,%rcx,8), %r8 + and 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/bdiv_q_1.s b/ext/gmp/gen/x86_64-linux/mpn/bdiv_q_1.s index 3c96e43ecb..4f58778551 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/bdiv_q_1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/bdiv_q_1.s @@ -145,63 +145,46 @@ __gmpn_pi1_bdiv_q_1: dec %r10 jz .Lone - lea 8(%rsi,%r10,8), %rsi + mov 8(%rsi), %rdx + lea (%rsi,%r10,8), %rsi lea (%rdi,%r10,8), %rdi neg %r10 - test %ecx, %ecx - jnz .Lunorm + shrd %cl, %rdx, %rax + xor %ebx, %ebx - jmp .Lnent + jmp .Lent .align 8, 0x90 -.Lntop:mul %r11 - mov -8(%rsi,%r10,8), %rax +.Ltop: + + + + + + + + mul %r11 + mov (%rsi,%r10,8), %rax + mov 8(%rsi,%r10,8), %r9 + shrd %cl, %r9, %rax + nop sub %rbx, %rax - setc %bl + setc %bl sub %rdx, %rax - adc $0, %ebx -.Lnent:imul %r8, %rax - mov %rax, (%rdi,%r10,8) - inc %r10 - jnz .Lntop - - mov -8(%rsi), %r9 - jmp 
.Lcom - -.Lunorm: - mov (%rsi,%r10,8), %r9 - shr %cl, %rax - neg %ecx - shl %cl, %r9 - neg %ecx - or %r9, %rax - xor %ebx, %ebx - jmp .Luent - - .align 8, 0x90 -.Lutop:mul %r11 - mov (%rsi,%r10,8), %rax - shl %cl, %rax - neg %ecx - or %r9, %rax + adc $0, %ebx +.Lent: imul %r8, %rax + mov %rax, (%rdi,%r10,8) + inc %r10 + jnz .Ltop + + mul %r11 + mov (%rsi), %rax + shr %cl, %rax sub %rbx, %rax - setc %bl sub %rdx, %rax - adc $0, %ebx -.Luent:imul %r8, %rax - mov (%rsi,%r10,8), %r9 - shr %cl, %r9 - neg %ecx - mov %rax, (%rdi,%r10,8) - inc %r10 - jnz .Lutop - -.Lcom: mul %r11 - sub %rbx, %r9 - sub %rdx, %r9 - imul %r8, %r9 - mov %r9, (%rdi) + imul %r8, %rax + mov %rax, (%rdi) pop %rbx ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/cnd_add_n.s b/ext/gmp/gen/x86_64-linux/mpn/cnd_add_n.s index 063d5dc7d7..b046e3642c 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/cnd_add_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/cnd_add_n.s @@ -60,11 +60,6 @@ - - - - - @@ -73,7 +68,9 @@ - + + + @@ -92,92 +89,101 @@ __gmpn_cnd_add_n: push %rbx + push %rbp + push %r12 + push %r13 + push %r14 neg %rdi - sbb %rbx, %rbx - - test $1, %r8b - jz .Lx0 -.Lx1: test $2, %r8b - jz .Lb1 - -.Lb3: mov (%rcx), %rdi - mov 8(%rcx), %r9 - mov 16(%rcx), %r10 - and %rbx, %rdi - and %rbx, %r9 - and %rbx, %r10 - add (%rdx), %rdi - mov %rdi, (%rsi) - adc 8(%rdx), %r9 - mov %r9, 8(%rsi) - adc 16(%rdx), %r10 - mov %r10, 16(%rsi) + sbb %rdi, %rdi + + lea (%rcx,%r8,8), %rcx + lea (%rdx,%r8,8), %rdx + lea (%rsi,%r8,8), %rsi + + mov %r8d, %eax + neg %r8 + and $3, %eax + jz .Ltop + cmp $2, %eax + jc .Lb1 + jz .Lb2 + +.Lb3: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov 16(%rcx,%r8,8), %r14 + and %rdi, %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r13 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r14 + mov 16(%rdx,%r8,8), %rbp + add %r12, %r10 + mov %r10, (%rsi,%r8,8) + adc %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) + adc %r14, %rbp + mov %rbp, 16(%rsi,%r8,8) sbb %eax, %eax - lea 24(%rdx), %rdx - lea 24(%rcx), %rcx - lea 24(%rsi), %rsi - 
sub $3, %r8 - jnz .Ltop + add $3, %r8 + js .Ltop jmp .Lend -.Lx0: xor %eax, %eax - test $2, %r8b - jz .Ltop - -.Lb2: mov (%rcx), %rdi - mov 8(%rcx), %r9 - and %rbx, %rdi - and %rbx, %r9 - add (%rdx), %rdi - mov %rdi, (%rsi) - adc 8(%rdx), %r9 - mov %r9, 8(%rsi) +.Lb2: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov (%rdx,%r8,8), %r10 + and %rdi, %r12 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r13 + add %r12, %r10 + mov %r10, (%rsi,%r8,8) + adc %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) sbb %eax, %eax - lea 16(%rdx), %rdx - lea 16(%rcx), %rcx - lea 16(%rsi), %rsi - sub $2, %r8 - jnz .Ltop + add $2, %r8 + js .Ltop jmp .Lend -.Lb1: mov (%rcx), %rdi - and %rbx, %rdi - add (%rdx), %rdi - mov %rdi, (%rsi) +.Lb1: mov (%rcx,%r8,8), %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r12 + add %r12, %r10 + mov %r10, (%rsi,%r8,8) sbb %eax, %eax - lea 8(%rdx), %rdx - lea 8(%rcx), %rcx - lea 8(%rsi), %rsi - dec %r8 - jz .Lend + add $1, %r8 + jns .Lend .align 16, 0x90 -.Ltop: mov (%rcx), %rdi - mov 8(%rcx), %r9 - mov 16(%rcx), %r10 - mov 24(%rcx), %r11 - lea 32(%rcx), %rcx - and %rbx, %rdi - and %rbx, %r9 - and %rbx, %r10 - and %rbx, %r11 +.Ltop: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov 16(%rcx,%r8,8), %r14 + mov 24(%rcx,%r8,8), %r11 + and %rdi, %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r13 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r14 + mov 16(%rdx,%r8,8), %rbp + and %rdi, %r11 + mov 24(%rdx,%r8,8), %r9 add %eax, %eax - adc (%rdx), %rdi - mov %rdi, (%rsi) - adc 8(%rdx), %r9 - mov %r9, 8(%rsi) - adc 16(%rdx), %r10 - mov %r10, 16(%rsi) - adc 24(%rdx), %r11 - lea 32(%rdx), %rdx - mov %r11, 24(%rsi) - lea 32(%rsi), %rsi + adc %r12, %r10 + mov %r10, (%rsi,%r8,8) + adc %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) + adc %r14, %rbp + mov %rbp, 16(%rsi,%r8,8) + adc %r11, %r9 + mov %r9, 24(%rsi,%r8,8) sbb %eax, %eax - sub $4, %r8 - jnz .Ltop + add $4, %r8 + js .Ltop .Lend: neg %eax + pop %r14 + pop %r13 + pop %r12 + pop %rbp pop %rbx ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/cnd_sub_n.s 
b/ext/gmp/gen/x86_64-linux/mpn/cnd_sub_n.s index 40b0e30be4..596dd8fd48 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/cnd_sub_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/cnd_sub_n.s @@ -60,14 +60,6 @@ - - - - - - - - @@ -75,8 +67,6 @@ - - @@ -102,102 +92,95 @@ __gmpn_cnd_sub_n: push %rbp push %r12 push %r13 + push %r14 neg %rdi - sbb %rbx, %rbx - - test $1, %r8b - jz .Lx0 -.Lx1: test $2, %r8b - jz .Lb1 - -.Lb3: mov (%rcx), %rdi - mov 8(%rcx), %r9 - mov 16(%rcx), %r10 - and %rbx, %rdi - mov (%rdx), %r12 - and %rbx, %r9 - mov 8(%rdx), %r13 - and %rbx, %r10 - mov 16(%rdx), %rbp - sub %rdi, %r12 - mov %r12, (%rsi) - sbb %r9, %r13 - mov %r13, 8(%rsi) - sbb %r10, %rbp - mov %rbp, 16(%rsi) + sbb %rdi, %rdi + + lea (%rcx,%r8,8), %rcx + lea (%rdx,%r8,8), %rdx + lea (%rsi,%r8,8), %rsi + + mov %r8d, %eax + neg %r8 + and $3, %eax + jz .Ltop + cmp $2, %eax + jc .Lb1 + jz .Lb2 + +.Lb3: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov 16(%rcx,%r8,8), %r14 + and %rdi, %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r13 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r14 + mov 16(%rdx,%r8,8), %rbp + sub %r12, %r10 + mov %r10, (%rsi,%r8,8) + sbb %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) + sbb %r14, %rbp + mov %rbp, 16(%rsi,%r8,8) sbb %eax, %eax - lea 24(%rdx), %rdx - lea 24(%rcx), %rcx - lea 24(%rsi), %rsi - sub $3, %r8 - jnz .Ltop + add $3, %r8 + js .Ltop jmp .Lend -.Lx0: xor %eax, %eax - test $2, %r8b - jz .Ltop - -.Lb2: mov (%rcx), %rdi - mov 8(%rcx), %r9 - mov (%rdx), %r12 - and %rbx, %rdi - mov 8(%rdx), %r13 - and %rbx, %r9 - sub %rdi, %r12 - mov %r12, (%rsi) - sbb %r9, %r13 - mov %r13, 8(%rsi) +.Lb2: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov (%rdx,%r8,8), %r10 + and %rdi, %r12 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r13 + sub %r12, %r10 + mov %r10, (%rsi,%r8,8) + sbb %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) sbb %eax, %eax - lea 16(%rdx), %rdx - lea 16(%rcx), %rcx - lea 16(%rsi), %rsi - sub $2, %r8 - jnz .Ltop + add $2, %r8 + js .Ltop jmp .Lend -.Lb1: mov (%rcx), %rdi - mov (%rdx), %r12 - 
and %rbx, %rdi - sub %rdi, %r12 - mov %r12, (%rsi) +.Lb1: mov (%rcx,%r8,8), %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r12 + sub %r12, %r10 + mov %r10, (%rsi,%r8,8) sbb %eax, %eax - lea 8(%rdx), %rdx - lea 8(%rcx), %rcx - lea 8(%rsi), %rsi - dec %r8 - jz .Lend + add $1, %r8 + jns .Lend .align 16, 0x90 -.Ltop: mov (%rcx), %rdi - mov 8(%rcx), %r9 - mov 16(%rcx), %r10 - mov 24(%rcx), %r11 - lea 32(%rcx), %rcx - and %rbx, %rdi - mov (%rdx), %r12 - and %rbx, %r9 - mov 8(%rdx), %r13 - and %rbx, %r10 - mov 16(%rdx), %rbp - and %rbx, %r11 +.Ltop: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov 16(%rcx,%r8,8), %r14 + mov 24(%rcx,%r8,8), %r11 + and %rdi, %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r13 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r14 + mov 16(%rdx,%r8,8), %rbp + and %rdi, %r11 + mov 24(%rdx,%r8,8), %r9 add %eax, %eax - mov 24(%rdx), %rax - lea 32(%rdx), %rdx - sbb %rdi, %r12 - mov %r12, (%rsi) - sbb %r9, %r13 - mov %r13, 8(%rsi) - sbb %r10, %rbp - mov %rbp, 16(%rsi) - sbb %r11, %rax - mov %rax, 24(%rsi) - lea 32(%rsi), %rsi + sbb %r12, %r10 + mov %r10, (%rsi,%r8,8) + sbb %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) + sbb %r14, %rbp + mov %rbp, 16(%rsi,%r8,8) + sbb %r11, %r9 + mov %r9, 24(%rsi,%r8,8) sbb %eax, %eax - sub $4, %r8 - jnz .Ltop + add $4, %r8 + js .Ltop .Lend: neg %eax + pop %r14 pop %r13 pop %r12 pop %rbp diff --git a/ext/gmp/gen/x86_64-linux/mpn/com.s b/ext/gmp/gen/x86_64-linux/mpn/com.s index 9d4f49cfc0..ff14001990 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/com.s +++ b/ext/gmp/gen/x86_64-linux/mpn/com.s @@ -39,44 +39,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -99,237 +61,50 @@ .text - .align 64, 0x90 + .align 32, 0x90 .globl __gmpn_com .type __gmpn_com,@function __gmpn_com: - - cmp $7, %rdx - jbe .Lbc - - pcmpeqb %xmm5, %xmm5 - - test $8, %dil - jz .Lrp_aligned - - mov (%rsi), %r8 - lea 8(%rsi), %rsi - not %r8 - mov %r8, (%rdi) - lea 8(%rdi), %rdi - dec %rdx - -.Lrp_aligned: - test $8, %sil - jnz .Luent - - jmp 
.Lam - - .align 16, 0x90 -.Latop:movaps 0(%rsi), %xmm0 - movaps 16(%rsi), %xmm1 - movaps 32(%rsi), %xmm2 - movaps 48(%rsi), %xmm3 - lea 64(%rsi), %rsi - pxor %xmm5, %xmm0 - pxor %xmm5, %xmm1 - pxor %xmm5, %xmm2 - pxor %xmm5, %xmm3 - movaps %xmm0, (%rdi) - movaps %xmm1, 16(%rdi) - movaps %xmm2, 32(%rdi) - movaps %xmm3, 48(%rdi) - lea 64(%rdi), %rdi -.Lam: sub $8, %rdx - jnc .Latop - - test $4, %dl - jz 1f - movaps (%rsi), %xmm0 - movaps 16(%rsi), %xmm1 - lea 32(%rsi), %rsi - pxor %xmm5, %xmm0 - pxor %xmm5, %xmm1 - movaps %xmm0, (%rdi) - movaps %xmm1, 16(%rdi) - lea 32(%rdi), %rdi - -1: test $2, %dl - jz 1f - movaps (%rsi), %xmm0 - lea 16(%rsi), %rsi - pxor %xmm5, %xmm0 - movaps %xmm0, (%rdi) - lea 16(%rdi), %rdi - -1: test $1, %dl - jz 1f - mov (%rsi), %r8 - not %r8 - mov %r8, (%rdi) - -1: - ret - -.Luent: - - - - - lea -40(%rsi), %rax - sub %rdi, %rax - cmp $80, %rax - jbe .Lbc - - sub $16, %rdx - jc .Luend - - movaps 120(%rsi), %xmm3 - - sub $16, %rdx - jmp .Lum - - .align 16, 0x90 -.Lutop:movaps 120(%rsi), %xmm3 - pxor %xmm5, %xmm0 - movaps %xmm0, -128(%rdi) - sub $16, %rdx -.Lum: movaps 104(%rsi), %xmm2 - .byte 0x66,0x0f,0x3a,0x0f,218,8 - movaps 88(%rsi), %xmm1 - pxor %xmm5, %xmm3 - movaps %xmm3, 112(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,209,8 - movaps 72(%rsi), %xmm0 - pxor %xmm5, %xmm2 - movaps %xmm2, 96(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps 56(%rsi), %xmm3 - pxor %xmm5, %xmm1 - movaps %xmm1, 80(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,195,8 - movaps 40(%rsi), %xmm2 - pxor %xmm5, %xmm0 - movaps %xmm0, 64(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,218,8 - movaps 24(%rsi), %xmm1 - pxor %xmm5, %xmm3 - movaps %xmm3, 48(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,209,8 - movaps 8(%rsi), %xmm0 - pxor %xmm5, %xmm2 - movaps %xmm2, 32(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps -8(%rsi), %xmm3 - pxor %xmm5, %xmm1 - movaps %xmm1, 16(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,195,8 - lea 128(%rsi), %rsi - lea 128(%rdi), %rdi - jnc .Lutop - - pxor %xmm5, %xmm0 - movaps %xmm0, -128(%rdi) - 
-.Luend:test $8, %dl - jz 1f - movaps 56(%rsi), %xmm3 - movaps 40(%rsi), %xmm2 - .byte 0x66,0x0f,0x3a,0x0f,218,8 - movaps 24(%rsi), %xmm1 - pxor %xmm5, %xmm3 - movaps %xmm3, 48(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,209,8 - movaps 8(%rsi), %xmm0 - pxor %xmm5, %xmm2 - movaps %xmm2, 32(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps -8(%rsi), %xmm3 - pxor %xmm5, %xmm1 - movaps %xmm1, 16(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,195,8 - lea 64(%rsi), %rsi - pxor %xmm5, %xmm0 - movaps %xmm0, (%rdi) - lea 64(%rdi), %rdi - -1: test $4, %dl - jz 1f - movaps 24(%rsi), %xmm1 - movaps 8(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps -8(%rsi), %xmm3 - pxor %xmm5, %xmm1 - movaps %xmm1, 16(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,195,8 - lea 32(%rsi), %rsi - pxor %xmm5, %xmm0 - movaps %xmm0, (%rdi) - lea 32(%rdi), %rdi - -1: test $2, %dl - jz 1f - movaps 8(%rsi), %xmm0 - movaps -8(%rsi), %xmm3 - .byte 0x66,0x0f,0x3a,0x0f,195,8 - lea 16(%rsi), %rsi - pxor %xmm5, %xmm0 - movaps %xmm0, (%rdi) - lea 16(%rdi), %rdi - -1: test $1, %dl - jz 1f - mov (%rsi), %r8 - not %r8 - mov %r8, (%rdi) - -1: - ret - - - - -.Lbc: lea -8(%rdi), %rdi - sub $4, %edx - jc .Lend - - .align 16, 0x90 -.Ltop: mov (%rsi), %r8 - mov 8(%rsi), %r9 - lea 32(%rdi), %rdi - mov 16(%rsi), %r10 - mov 24(%rsi), %r11 - lea 32(%rsi), %rsi - not %r8 - not %r9 - not %r10 - not %r11 - mov %r8, -24(%rdi) - mov %r9, -16(%rdi) - sub $4, %edx - mov %r10, -8(%rdi) - mov %r11, (%rdi) - jnc .Ltop - -.Lend: test $1, %dl - jz 1f - mov (%rsi), %r8 - not %r8 - mov %r8, 8(%rdi) - lea 8(%rdi), %rdi - lea 8(%rsi), %rsi -1: test $2, %dl - jz 1f - mov (%rsi), %r8 - mov 8(%rsi), %r9 - not %r8 - not %r9 - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) -1: + movq (%rsi), %r8 + movl %edx, %eax + leaq (%rsi,%rdx,8), %rsi + leaq (%rdi,%rdx,8), %rdi + negq %rdx + andl $3, %eax + je .Lb00 + cmpl $2, %eax + jc .Lb01 + je .Lb10 + +.Lb11: notq %r8 + movq %r8, (%rdi,%rdx,8) + decq %rdx + jmp .Le11 +.Lb10: addq $-2, %rdx + jmp .Le10 + .byte 0x90,0x90,0x90,0x90,0x90,0x90 
+.Lb01: notq %r8 + movq %r8, (%rdi,%rdx,8) + incq %rdx + jz .Lret + +.Loop: movq (%rsi,%rdx,8), %r8 +.Lb00: movq 8(%rsi,%rdx,8), %r9 + notq %r8 + notq %r9 + movq %r8, (%rdi,%rdx,8) + movq %r9, 8(%rdi,%rdx,8) +.Le11: movq 16(%rsi,%rdx,8), %r8 +.Le10: movq 24(%rsi,%rdx,8), %r9 + notq %r8 + notq %r9 + movq %r8, 16(%rdi,%rdx,8) + movq %r9, 24(%rdi,%rdx,8) + addq $4, %rdx + jnc .Loop +.Lret: ret .size __gmpn_com,.-__gmpn_com - diff --git a/ext/gmp/gen/x86_64-linux/mpn/copyd.s b/ext/gmp/gen/x86_64-linux/mpn/copyd.s index 583e8c9ec5..f375481084 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/copyd.s +++ b/ext/gmp/gen/x86_64-linux/mpn/copyd.s @@ -45,35 +45,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -102,178 +73,36 @@ __gmpn_copyd: - - lea -8(%rsi,%rdx,8), %rsi - lea -8(%rdi,%rdx,8), %rdi - - cmp $7, %rdx - jbe .Lbc - - test $8, %dil - jnz .Lrp_aligned - - mov (%rsi), %rax - mov %rax, (%rdi) - lea -8(%rsi), %rsi - lea -8(%rdi), %rdi - dec %rdx - -.Lrp_aligned: - test $8, %sil - jz .Luent - - jmp .Lam - - .align 16, 0x90 -.Latop:movaps -8(%rsi), %xmm0 - movaps -24(%rsi), %xmm1 - movaps -40(%rsi), %xmm2 - movaps -56(%rsi), %xmm3 - lea -64(%rsi), %rsi - movaps %xmm0, -8(%rdi) - movaps %xmm1, -24(%rdi) - movaps %xmm2, -40(%rdi) - movaps %xmm3, -56(%rdi) - lea -64(%rdi), %rdi -.Lam: sub $8, %rdx - jnc .Latop - - test $4, %dl - jz 1f - movaps -8(%rsi), %xmm0 - movaps -24(%rsi), %xmm1 - lea -32(%rsi), %rsi - movaps %xmm0, -8(%rdi) - movaps %xmm1, -24(%rdi) - lea -32(%rdi), %rdi - -1: test $2, %dl - jz 1f - movaps -8(%rsi), %xmm0 - lea -16(%rsi), %rsi - movaps %xmm0, -8(%rdi) - lea -16(%rdi), %rdi - -1: test $1, %dl - jz 1f - mov (%rsi), %r8 - mov %r8, (%rdi) - -1: - ret - -.Luent:sub $16, %rdx - movaps (%rsi), %xmm0 - jc .Luend - - .align 16, 0x90 -.Lutop:sub $16, %rdx - movaps -16(%rsi), %xmm1 - .byte 0x66,0x0f,0x3a,0x0f,193,8 - movaps %xmm0, -8(%rdi) - movaps -32(%rsi), %xmm2 - .byte 0x66,0x0f,0x3a,0x0f,202,8 - movaps %xmm1, -24(%rdi) - movaps -48(%rsi), %xmm3 - 
.byte 0x66,0x0f,0x3a,0x0f,211,8 - movaps %xmm2, -40(%rdi) - movaps -64(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,216,8 - movaps %xmm3, -56(%rdi) - movaps -80(%rsi), %xmm1 - .byte 0x66,0x0f,0x3a,0x0f,193,8 - movaps %xmm0, -72(%rdi) - movaps -96(%rsi), %xmm2 - .byte 0x66,0x0f,0x3a,0x0f,202,8 - movaps %xmm1, -88(%rdi) - movaps -112(%rsi), %xmm3 - .byte 0x66,0x0f,0x3a,0x0f,211,8 - movaps %xmm2, -104(%rdi) - movaps -128(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,216,8 - movaps %xmm3, -120(%rdi) - lea -128(%rsi), %rsi - lea -128(%rdi), %rdi - jnc .Lutop - -.Luend:test $8, %dl - jz 1f - movaps -16(%rsi), %xmm1 - .byte 0x66,0x0f,0x3a,0x0f,193,8 - movaps %xmm0, -8(%rdi) - movaps -32(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps %xmm1, -24(%rdi) - movaps -48(%rsi), %xmm1 - .byte 0x66,0x0f,0x3a,0x0f,193,8 - movaps %xmm0, -40(%rdi) - movaps -64(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps %xmm1, -56(%rdi) - lea -64(%rsi), %rsi - lea -64(%rdi), %rdi - -1: test $4, %dl - jz 1f - movaps -16(%rsi), %xmm1 - .byte 0x66,0x0f,0x3a,0x0f,193,8 - movaps %xmm0, -8(%rdi) - movaps -32(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps %xmm1, -24(%rdi) - lea -32(%rsi), %rsi - lea -32(%rdi), %rdi - -1: test $2, %dl - jz 1f - movaps -16(%rsi), %xmm1 - .byte 0x66,0x0f,0x3a,0x0f,193,8 - movaps %xmm0, -8(%rdi) - lea -16(%rsi), %rsi - lea -16(%rdi), %rdi - -1: test $1, %dl - jz 1f - mov (%rsi), %r8 - mov %r8, (%rdi) - -1: - ret - - - - -.Lbc: sub $4, %edx + lea (%rdi,%rdx,8), %rdi + sub $4, %rdx jc .Lend + nop - .align 16, 0x90 -.Ltop: mov (%rsi), %r8 +.Ltop: mov (%rsi), %rax mov -8(%rsi), %r9 lea -32(%rdi), %rdi mov -16(%rsi), %r10 mov -24(%rsi), %r11 lea -32(%rsi), %rsi - mov %r8, 32(%rdi) - mov %r9, 24(%rdi) - - mov %r10, 16(%rdi) - mov %r11, 8(%rdi) - - -.Lend: test $1, %dl - jz 1f - mov (%rsi), %r8 - mov %r8, (%rdi) + mov %rax, 24(%rdi) + mov %r9, 16(%rdi) + sub $4, %rdx + mov %r10, 8(%rdi) + mov %r11, (%rdi) + jnc .Ltop + +.Lend: shr %edx + jnc 1f + mov (%rsi), %rax + 
mov %rax, -8(%rdi) lea -8(%rdi), %rdi lea -8(%rsi), %rsi -1: test $2, %dl - jz 1f - mov (%rsi), %r8 +1: shr %edx + jnc 1f + mov (%rsi), %rax mov -8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, -8(%rdi) -1: - ret + mov %rax, -8(%rdi) + mov %r9, -16(%rdi) +1: ret .size __gmpn_copyd,.-__gmpn_copyd - diff --git a/ext/gmp/gen/x86_64-linux/mpn/copyi.s b/ext/gmp/gen/x86_64-linux/mpn/copyi.s index a5c971baa6..dc746b2270 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/copyi.s +++ b/ext/gmp/gen/x86_64-linux/mpn/copyi.s @@ -45,38 +45,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -100,225 +68,40 @@ .text .align 64, 0x90 + .byte 0,0,0,0,0,0 .globl __gmpn_copyi .type __gmpn_copyi,@function __gmpn_copyi: - - - cmp $7, %rdx - jbe .Lbc - - test $8, %dil - jz .Lrp_aligned - - movsq - dec %rdx - -.Lrp_aligned: - test $8, %sil - jnz .Luent - - jmp .Lam - - .align 16, 0x90 -.Latop:movdqa 0(%rsi), %xmm0 - movdqa 16(%rsi), %xmm1 - movdqa 32(%rsi), %xmm2 - movdqa 48(%rsi), %xmm3 - lea 64(%rsi), %rsi - movdqa %xmm0, (%rdi) - movdqa %xmm1, 16(%rdi) - movdqa %xmm2, 32(%rdi) - movdqa %xmm3, 48(%rdi) - lea 64(%rdi), %rdi -.Lam: sub $8, %rdx - jnc .Latop - - test $4, %dl - jz 1f - movdqa (%rsi), %xmm0 - movdqa 16(%rsi), %xmm1 - lea 32(%rsi), %rsi - movdqa %xmm0, (%rdi) - movdqa %xmm1, 16(%rdi) - lea 32(%rdi), %rdi - -1: test $2, %dl - jz 1f - movdqa (%rsi), %xmm0 - lea 16(%rsi), %rsi - movdqa %xmm0, (%rdi) - lea 16(%rdi), %rdi - -1: test $1, %dl - jz 1f - mov (%rsi), %r8 - mov %r8, (%rdi) - -1: - ret - -.Luent: - - - cmp $16, %rdx - jc .Lued0 - - - - - - - movaps 120(%rsi), %xmm7 - movaps 104(%rsi), %xmm6 - movaps 88(%rsi), %xmm5 - movaps 72(%rsi), %xmm4 - movaps 56(%rsi), %xmm3 - movaps 40(%rsi), %xmm2 - lea 128(%rsi), %rsi - sub $32, %rdx - jc .Lued1 - - .align 16, 0x90 -.Lutop:movaps -104(%rsi), %xmm1 - sub $16, %rdx - movaps -120(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,254,8 - movaps -136(%rsi), %xmm8 - movdqa %xmm7, 112(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,245,8 - movaps 
120(%rsi), %xmm7 - movdqa %xmm6, 96(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,236,8 - movaps 104(%rsi), %xmm6 - movdqa %xmm5, 80(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,227,8 - movaps 88(%rsi), %xmm5 - movdqa %xmm4, 64(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,218,8 - movaps 72(%rsi), %xmm4 - movdqa %xmm3, 48(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,209,8 - movaps 56(%rsi), %xmm3 - movdqa %xmm2, 32(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps 40(%rsi), %xmm2 - movdqa %xmm1, 16(%rdi) - .byte 0x66,65,0x0f,0x3a,0x0f,192,8 - lea 128(%rsi), %rsi - movdqa %xmm0, (%rdi) - lea 128(%rdi), %rdi - jnc .Lutop - -.Lued1:movaps -104(%rsi), %xmm1 - movaps -120(%rsi), %xmm0 - movaps -136(%rsi), %xmm8 - .byte 0x66,0x0f,0x3a,0x0f,254,8 - movdqa %xmm7, 112(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,245,8 - movdqa %xmm6, 96(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,236,8 - movdqa %xmm5, 80(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,227,8 - movdqa %xmm4, 64(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,218,8 - movdqa %xmm3, 48(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,209,8 - movdqa %xmm2, 32(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movdqa %xmm1, 16(%rdi) - .byte 0x66,65,0x0f,0x3a,0x0f,192,8 - movdqa %xmm0, (%rdi) - lea 128(%rdi), %rdi - - - - - - -.Lued0:test $8, %dl - jz 1f - movaps 56(%rsi), %xmm3 - movaps 40(%rsi), %xmm2 - movaps 24(%rsi), %xmm1 - movaps 8(%rsi), %xmm0 - movaps -8(%rsi), %xmm4 - .byte 0x66,0x0f,0x3a,0x0f,218,8 - movdqa %xmm3, 48(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,209,8 - movdqa %xmm2, 32(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movdqa %xmm1, 16(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,196,8 - lea 64(%rsi), %rsi - movdqa %xmm0, (%rdi) - lea 64(%rdi), %rdi - -1: test $4, %dl - jz 1f - movaps 24(%rsi), %xmm1 - movaps 8(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps -8(%rsi), %xmm3 - movdqa %xmm1, 16(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,195,8 - lea 32(%rsi), %rsi - movdqa %xmm0, (%rdi) - lea 32(%rdi), %rdi - -1: test $2, %dl - jz 1f - movdqa 8(%rsi), %xmm0 - movdqa -8(%rsi), %xmm3 - .byte 0x66,0x0f,0x3a,0x0f,195,8 - lea 16(%rsi), %rsi - 
movdqa %xmm0, (%rdi) - lea 16(%rdi), %rdi - -1: test $1, %dl - jz 1f - mov (%rsi), %r8 - mov %r8, (%rdi) - -1: - ret - - - - -.Lbc: lea -8(%rdi), %rdi - sub $4, %edx + lea -8(%rdi), %rdi + sub $4, %rdx jc .Lend - .align 16, 0x90 -.Ltop: mov (%rsi), %r8 +.Ltop: mov (%rsi), %rax mov 8(%rsi), %r9 lea 32(%rdi), %rdi mov 16(%rsi), %r10 mov 24(%rsi), %r11 lea 32(%rsi), %rsi - mov %r8, -24(%rdi) + mov %rax, -24(%rdi) mov %r9, -16(%rdi) - + sub $4, %rdx mov %r10, -8(%rdi) mov %r11, (%rdi) + jnc .Ltop - -.Lend: test $1, %dl - jz 1f - mov (%rsi), %r8 - mov %r8, 8(%rdi) +.Lend: shr %edx + jnc 1f + mov (%rsi), %rax + mov %rax, 8(%rdi) lea 8(%rdi), %rdi lea 8(%rsi), %rsi -1: test $2, %dl - jz 1f - mov (%rsi), %r8 +1: shr %edx + jnc 1f + mov (%rsi), %rax mov 8(%rsi), %r9 - mov %r8, 8(%rdi) + mov %rax, 8(%rdi) mov %r9, 16(%rdi) -1: - ret +1: ret .size __gmpn_copyi,.-__gmpn_copyi - diff --git a/ext/gmp/gen/x86_64-linux/mpn/div_qr_1n_pi1.s b/ext/gmp/gen/x86_64-linux/mpn/div_qr_1n_pi1.s index 652beccbf2..fd8ce8e9e6 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/div_qr_1n_pi1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/div_qr_1n_pi1.s @@ -158,17 +158,18 @@ __gmpn_div_qr_1n_pi1: dec %r8 mov %rcx, %rax jz .Lfinal + mov $0, %r14d .align 16, 0x90 + .Lloop: - mov %r9, %r14 + cmovc %r9, %r14 mov %r12, %r15 - and %r12, %r14 neg %r15 mul %r9 add %rdx, %r14 @@ -195,6 +196,7 @@ __gmpn_div_qr_1n_pi1: mov %r10, %rax adc %rdx, %rax mov %r14, (%rdi, %r8, 8) + mov $0, %r14d sbb %r12, %r12 dec %r8 mov %rax, %rcx diff --git a/ext/gmp/gen/x86_64-linux/mpn/divrem_1.s b/ext/gmp/gen/x86_64-linux/mpn/divrem_1.s index 5363432e8d..e689bd27f4 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/divrem_1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/divrem_1.s @@ -64,20 +64,6 @@ - - - - - - - - - - - - - - @@ -347,4 +333,3 @@ __gmpn_divrem_1: ret .size __gmpn_divrem_1,.-__gmpn_divrem_1 - diff --git a/ext/gmp/gen/x86_64-linux/mpn/gcd_11.s b/ext/gmp/gen/x86_64-linux/mpn/gcd_11.s index 4647639cd9..cf35d253b9 100644 --- 
a/ext/gmp/gen/x86_64-linux/mpn/gcd_11.s +++ b/ext/gmp/gen/x86_64-linux/mpn/gcd_11.s @@ -76,13 +76,140 @@ - - - - - - - + .section .rodata + .align 64, 0x90 +ctz_table: + + .byte 7 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 6 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + + .size ctz_table,.-ctz_table @@ -99,22 +226,31 @@ __gmpn_gcd_11: - jmp .Lodd + mov ctz_table@GOTPCREL(%rip), %r8 + + + jmp .Lent .align 16, 0x90 .Ltop: cmovc %rdx, %rdi cmovc %rax, %rsi +.Lmid: and $127, %edx + movzbl (%r8,%rdx), %ecx + jz .Lshift_alot shr %cl, %rdi -.Lodd: mov %rsi, %rdx - sub %rdi, %rdx - bsf %rdx, %rcx - mov %rdi, %rax - sub %rsi, %rdi +.Lent: mov %rdi, %rax + mov %rsi, %rdx + sub %rdi, %rdx + sub %rsi, %rdi jnz .Ltop .Lend: ret - .size __gmpn_gcd_11,.-__gmpn_gcd_11 +.Lshift_alot: + shr $7, %rdi + mov %rdi, %rdx + jmp 
.Lmid + .size __gmpn_gcd_11,.-__gmpn_gcd_11 diff --git a/ext/gmp/gen/x86_64-linux/mpn/gcd_22.s b/ext/gmp/gen/x86_64-linux/mpn/gcd_22.s index e3d86b92e4..60f4c714c9 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/gcd_22.s +++ b/ext/gmp/gen/x86_64-linux/mpn/gcd_22.s @@ -67,6 +67,276 @@ + + + + + + + + + .section .rodata + .align 64, 0x90 +ctz_table: + + .byte 8 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 6 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 7 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + 
.byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 6 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + + .size ctz_table,.-ctz_table @@ -92,32 +362,40 @@ __gmpn_gcd_22: + mov %rcx, %rax + + mov ctz_table@GOTPCREL(%rip), %r10 + + .align 16, 0x90 -.Ltop: mov %rcx, %r10 - sub %rsi, %r10 +.Ltop: mov %rax, %rcx + sub %rsi, %rcx jz .Llowz mov %rdx, %r11 sbb %rdi, %r11 - rep;bsf %r10, %rax - mov %rsi, %r8 - sub %rcx, %rsi mov %rdi, %r9 + + sub %rax, %rsi sbb %rdx, %rdi -.Lbck: cmovc %r10, %rsi +.Lbck: cmovc %rcx, %rsi cmovc %r11, %rdi - cmovc %r8, %rcx + cmovc %r8, %rax cmovc %r9, %rdx - xor %r10d, %r10d - sub %rax, %r10 - .byte 0xc4,98,169,0xf7,207 - .byte 0xc4,226,251,0xf7,246 - .byte 0xc4,226,251,0xf7,255 - or %r9, %rsi + and $255, %ecx + movzbl (%r10,%rcx), %ecx + jz .Lcount_better + +.Lshr: shr %cl, %rsi + mov %rdi, %r11 + shr %cl, %rdi + neg %rcx + shl %cl, %r11 + or %r11, %rsi test %rdx, %rdx jnz .Ltop @@ -125,29 +403,32 @@ __gmpn_gcd_22: jnz .Ltop .Lgcd_11: - mov %rcx, %rdi + mov %rax, %rdi jmp __gmpn_gcd_11@PLT +.Lcount_better: + rep;bsf %rsi, %rcx + jmp .Lshr + .Llowz: - mov %rdx, %r10 - sub %rdi, %r10 + mov %rdx, %rcx + sub %rdi, %rcx je .Lend xor %r11, %r11 mov %rsi, 
%r8 mov %rdi, %r9 - rep;bsf %r10, %rax mov %rdi, %rsi xor %rdi, %rdi sub %rdx, %rsi jmp .Lbck -.Lend: mov %rcx, %rax +.Lend: + -.Lret: ret .size __gmpn_gcd_22,.-__gmpn_gcd_22 diff --git a/ext/gmp/gen/x86_64-linux/mpn/hamdist.s b/ext/gmp/gen/x86_64-linux/mpn/hamdist.s index 1c5d6e4192..1ab3a8cca6 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/hamdist.s +++ b/ext/gmp/gen/x86_64-linux/mpn/hamdist.s @@ -60,21 +60,16 @@ - - - - - - - - - - - - - - - + + + + + + + + + + @@ -91,119 +86,82 @@ __gmpn_hamdist: - + push %rbx + mov $0x5555555555555555, %r10 push %rbp - - mov (%rdi), %r10 - xor (%rsi), %r10 - - mov %edx, %r8d - and $3, %r8d - - xor %ecx, %ecx - .byte 0xf3,0x49,0x0f,0xb8,0xc2 - - lea .Ltab(%rip), %r9 - - movslq (%r9,%r8,4), %r8 - add %r9, %r8 - jmp *%r8 - - -.L3: mov 8(%rdi), %r10 - mov 16(%rdi), %r11 - xor 8(%rsi), %r10 - xor 16(%rsi), %r11 - xor %ebp, %ebp - sub $4, %rdx - jle .Lx3 - mov 24(%rdi), %r8 - mov 32(%rdi), %r9 - add $24, %rdi - add $24, %rsi - jmp .Le3 - -.L0: mov 8(%rdi), %r9 - xor 8(%rsi), %r9 - mov 16(%rdi), %r10 - mov 24(%rdi), %r11 - xor %ebx, %ebx - xor 16(%rsi), %r10 - xor 24(%rsi), %r11 - add $32, %rdi - add $32, %rsi - sub $4, %rdx - jle .Lx4 + mov $0x3333333333333333, %r11 + push %r12 + lea (%rdi,%rdx,8), %rdi + mov $0x0f0f0f0f0f0f0f0f, %rcx + lea (%rsi,%rdx,8), %rsi + neg %rdx + mov $0x0101010101010101, %r12 + xor %eax, %eax + test $1, %dl + jz .Ltop + + mov (%rdi,%rdx,8), %r8 + xor (%rsi,%rdx,8), %r8 + + mov %r8, %r9 + shr %r8 + and %r10, %r8 + sub %r8, %r9 + + mov %r9, %r8 + shr $2, %r9 + and %r11, %r8 + and %r11, %r9 + add %r8, %r9 + + dec %rdx + jmp .Lmid .align 16, 0x90 -.Ltop: -.Le0: .byte 0xf3,0x49,0x0f,0xb8,0xe9 - mov (%rdi), %r8 - mov 8(%rdi), %r9 - add %rbx, %rax -.Le3: .byte 0xf3,0x49,0x0f,0xb8,0xda - xor (%rsi), %r8 - xor 8(%rsi), %r9 - add %rbp, %rcx -.Le2: .byte 0xf3,0x49,0x0f,0xb8,0xeb - mov 16(%rdi), %r10 - mov 24(%rdi), %r11 - add $32, %rdi - add %rbx, %rax -.Le1: .byte 0xf3,0x49,0x0f,0xb8,0xd8 - xor 16(%rsi), %r10 - xor 
24(%rsi), %r11 - add $32, %rsi - add %rbp, %rcx - sub $4, %rdx - jg .Ltop - -.Lx4: .byte 0xf3,0x49,0x0f,0xb8,0xe9 - add %rbx, %rax -.Lx3: .byte 0xf3,0x49,0x0f,0xb8,0xda - add %rbp, %rcx - .byte 0xf3,0x49,0x0f,0xb8,0xeb - add %rbx, %rax - add %rbp, %rcx -.Lx2: add %rcx, %rax -.Lx1: pop %rbp +.Ltop: mov (%rdi,%rdx,8), %r8 + mov 8(%rdi,%rdx,8), %rbx + xor (%rsi,%rdx,8), %r8 + xor 8(%rsi,%rdx,8), %rbx + + mov %r8, %r9 + mov %rbx, %rbp + shr %r8 + shr %rbx + and %r10, %r8 + and %r10, %rbx + sub %r8, %r9 + sub %rbx, %rbp + + mov %r9, %r8 + mov %rbp, %rbx + shr $2, %r9 + shr $2, %rbp + and %r11, %r8 + and %r11, %r9 + and %r11, %rbx + and %r11, %rbp + add %r8, %r9 + add %rbx, %rbp + + add %rbp, %r9 +.Lmid: mov %r9, %r8 + shr $4, %r9 + and %rcx, %r8 + and %rcx, %r9 + add %r8, %r9 + + imul %r12, %r9 + shr $56, %r9 + + add %r9, %rax + add $2, %rdx + jnc .Ltop + +.Lend: + pop %r12 + pop %rbp pop %rbx ret - -.L2: mov 8(%rdi), %r11 - xor 8(%rsi), %r11 - sub $2, %rdx - jle .Ln2 - mov 16(%rdi), %r8 - mov 24(%rdi), %r9 - xor %ebx, %ebx - xor 16(%rsi), %r8 - xor 24(%rsi), %r9 - add $16, %rdi - add $16, %rsi - jmp .Le2 -.Ln2: .byte 0xf3,0x49,0x0f,0xb8,0xcb - jmp .Lx2 - -.L1: dec %rdx - jle .Lx1 - mov 8(%rdi), %r8 - mov 16(%rdi), %r9 - xor 8(%rsi), %r8 - xor 16(%rsi), %r9 - xor %ebp, %ebp - mov 24(%rdi), %r10 - mov 32(%rdi), %r11 - add $40, %rdi - add $8, %rsi - jmp .Le1 - .size __gmpn_hamdist,.-__gmpn_hamdist - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .L0-.Ltab - .long .L1-.Ltab - .long .L2-.Ltab - .long .L3-.Ltab diff --git a/ext/gmp/gen/x86_64-linux/mpn/ior_n.s b/ext/gmp/gen/x86_64-linux/mpn/ior_n.s index fc23fd7190..6509f28b3b 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/ior_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/ior_n.s @@ -90,7 +90,6 @@ - .text @@ -103,50 +102,42 @@ __gmpn_ior_n: mov (%rdx), %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 
-.Lb11: or (%rsi), %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi +.Lb11: or (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: or (%rsi), %r8 - mov %r8, (%rdi) - dec %rcx +.Lb01: or (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 -.Lb00: mov 8(%rdx), %r9 - or (%rsi), %r8 - or 8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 -.Le10: mov 24(%rdx), %r9 - lea 32(%rdx), %rdx - or 16(%rsi), %r8 - or 24(%rsi), %r9 - lea 32(%rsi), %rsi - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + +.Ltop: mov (%rdx,%rcx,8), %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + or (%rsi,%rcx,8), %r8 + or 8(%rsi,%rcx,8), %r9 + nop + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + or 16(%rsi,%rcx,8), %r8 + or 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/iorn_n.s b/ext/gmp/gen/x86_64-linux/mpn/iorn_n.s index e13105d814..b199ca33ff 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/iorn_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/iorn_n.s @@ -92,7 +92,6 @@ - .text @@ -106,54 +105,46 @@ __gmpn_iorn_n: mov (%rdx), %r8 not %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: or (%rsi), %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi +.Lb11: or (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx 
jmp .Le10 -.Lb01: or (%rsi), %r8 - mov %r8, (%rdi) - dec %rcx + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: or (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 +.Ltop: mov (%rdx,%rcx,8), %r8 not %r8 -.Lb00: mov 8(%rdx), %r9 +.Lb00: mov 8(%rdx,%rcx,8), %r9 not %r9 - or (%rsi), %r8 - or 8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 + or (%rsi,%rcx,8), %r8 + or 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 not %r8 -.Le10: mov 24(%rdx), %r9 +.Le10: mov 24(%rdx,%rcx,8), %r9 not %r9 - lea 32(%rdx), %rdx - or 16(%rsi), %r8 - or 24(%rsi), %r9 - lea 32(%rsi), %rsi - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + or 16(%rsi,%rcx,8), %r8 + or 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/lshift.s b/ext/gmp/gen/x86_64-linux/mpn/lshift.s index ebd4035c21..89e9566e3c 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/lshift.s +++ b/ext/gmp/gen/x86_64-linux/mpn/lshift.s @@ -41,32 +41,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -89,123 +63,124 @@ .text - .align 64, 0x90 + .align 32, 0x90 .globl __gmpn_lshift .type __gmpn_lshift,@function __gmpn_lshift: - movd %ecx, %xmm4 - mov $64, %eax - sub %ecx, %eax - movd %eax, %xmm5 - - neg %ecx + neg %ecx mov -8(%rsi,%rdx,8), %rax - shr %cl, %rax - - cmp $3, %rdx - jle .Lbc + shr %cl, %rax - lea (%rdi,%rdx,8), %ecx - test $8, %cl - jz .Lrp_aligned - - - movq -8(%rsi,%rdx,8), %xmm0 - movq -16(%rsi,%rdx,8), %xmm1 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movq %xmm0, -8(%rdi,%rdx,8) - dec %rdx - -.Lrp_aligned: + neg %ecx lea 1(%rdx), %r8d - - and $6, %r8d - jz .Lba0 - cmp $4, %r8d - jz .Lba4 - jc .Lba2 -.Lba6: add $-4, %rdx - jmp 
.Li56 -.Lba0: add $-6, %rdx - jmp .Li70 -.Lba4: add $-2, %rdx - jmp .Li34 -.Lba2: add $-8, %rdx - jle .Lend - + and $3, %r8d + je .Lrlx + + dec %r8d + jne .L1 + + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + shr %cl, %r8 + or %r8, %r10 + mov %r10, -8(%rdi,%rdx,8) + dec %rdx + jmp .Lrll + +.L1: dec %r8d + je .L1x + + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + shr %cl, %r8 + or %r8, %r10 + mov %r10, -8(%rdi,%rdx,8) + dec %rdx + neg %ecx +.L1x: + cmp $1, %rdx + je .Last + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r11 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + mov -24(%rsi,%rdx,8), %r9 + shr %cl, %r8 + or %r8, %r10 + shr %cl, %r9 + or %r9, %r11 + mov %r10, -8(%rdi,%rdx,8) + mov %r11, -16(%rdi,%rdx,8) + sub $2, %rdx + +.Lrll: neg %ecx +.Lrlx: mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r11 + + sub $4, %rdx + jb .Lend .align 16, 0x90 -.Ltop: movdqu 40(%rsi,%rdx,8), %xmm1 - movdqu 48(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, 48(%rdi,%rdx,8) -.Li70: - movdqu 24(%rsi,%rdx,8), %xmm1 - movdqu 32(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, 32(%rdi,%rdx,8) -.Li56: - movdqu 8(%rsi,%rdx,8), %xmm1 - movdqu 16(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, 16(%rdi,%rdx,8) -.Li34: - movdqu -8(%rsi,%rdx,8), %xmm1 - movdqu (%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, (%rdi,%rdx,8) - sub $8, %rdx - jg .Ltop - -.Lend: test $1, %dl - jnz .Lend8 - - movdqu (%rsi), %xmm1 - pxor %xmm0, %xmm0 - punpcklqdq %xmm1, %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - movdqa %xmm0, (%rdi) +.Ltop: - ret - + neg %ecx + mov 16(%rsi,%rdx,8), %r8 + mov 8(%rsi,%rdx,8), %r9 + shr %cl, %r8 + or %r8, %r10 + shr %cl, %r9 + or 
%r9, %r11 + mov %r10, 24(%rdi,%rdx,8) + mov %r11, 16(%rdi,%rdx,8) + + mov 0(%rsi,%rdx,8), %r8 + mov -8(%rsi,%rdx,8), %r9 + shr %cl, %r8 + shr %cl, %r9 - .align 16, 0x90 -.Lbc: dec %edx - jz .Lend8 - - movq (%rsi,%rdx,8), %xmm1 - movq -8(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - movq %xmm0, (%rdi,%rdx,8) - sub $2, %edx - jl .Lend8 - movq 8(%rsi), %xmm1 - movq (%rsi), %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - movq %xmm0, 8(%rdi) - -.Lend8:movq (%rsi), %xmm0 - psllq %xmm4, %xmm0 - movq %xmm0, (%rdi) + + neg %ecx + mov 8(%rsi,%rdx,8), %r10 + mov 0(%rsi,%rdx,8), %r11 + shl %cl, %r10 + or %r10, %r8 + shl %cl, %r11 + or %r11, %r9 + mov %r8, 8(%rdi,%rdx,8) + mov %r9, 0(%rdi,%rdx,8) + + mov -8(%rsi,%rdx,8), %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r10 + shl %cl, %r11 + + sub $4, %rdx + jae .Ltop +.Lend: + neg %ecx + mov 8(%rsi), %r8 + shr %cl, %r8 + or %r8, %r10 + mov (%rsi), %r9 + shr %cl, %r9 + or %r9, %r11 + mov %r10, 16(%rdi) + mov %r11, 8(%rdi) + + neg %ecx +.Last: mov (%rsi), %r10 + shl %cl, %r10 + mov %r10, (%rdi) ret .size __gmpn_lshift,.-__gmpn_lshift - diff --git a/ext/gmp/gen/x86_64-linux/mpn/lshiftc.s b/ext/gmp/gen/x86_64-linux/mpn/lshiftc.s index 1ed069b688..680994041a 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/lshiftc.s +++ b/ext/gmp/gen/x86_64-linux/mpn/lshiftc.s @@ -41,32 +41,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -89,134 +63,135 @@ .text - .align 64, 0x90 + .align 32, 0x90 .globl __gmpn_lshiftc .type __gmpn_lshiftc,@function __gmpn_lshiftc: - movd %ecx, %xmm4 - mov $64, %eax - sub %ecx, %eax - movd %eax, %xmm5 - - neg %ecx + neg %ecx mov -8(%rsi,%rdx,8), %rax - shr %cl, %rax - - pcmpeqb %xmm3, %xmm3 - - cmp $3, %rdx - jle .Lbc - - lea (%rdi,%rdx,8), %ecx - test $8, %cl - jz .Lrp_aligned - - - movq -8(%rsi,%rdx,8), %xmm0 - movq -16(%rsi,%rdx,8), %xmm1 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movq %xmm0, -8(%rdi,%rdx,8) - dec 
%rdx + shr %cl, %rax -.Lrp_aligned: + neg %ecx lea 1(%rdx), %r8d - - and $6, %r8d - jz .Lba0 - cmp $4, %r8d - jz .Lba4 - jc .Lba2 -.Lba6: add $-4, %rdx - jmp .Li56 -.Lba0: add $-6, %rdx - jmp .Li70 -.Lba4: add $-2, %rdx - jmp .Li34 -.Lba2: add $-8, %rdx - jle .Lend - + and $3, %r8d + je .Lrlx + + dec %r8d + jne .L1 + + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + shr %cl, %r8 + or %r8, %r10 + not %r10 + mov %r10, -8(%rdi,%rdx,8) + dec %rdx + jmp .Lrll + +.L1: dec %r8d + je .L1x + + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + shr %cl, %r8 + or %r8, %r10 + not %r10 + mov %r10, -8(%rdi,%rdx,8) + dec %rdx + neg %ecx +.L1x: + cmp $1, %rdx + je .Last + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r11 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + mov -24(%rsi,%rdx,8), %r9 + shr %cl, %r8 + or %r8, %r10 + shr %cl, %r9 + or %r9, %r11 + not %r10 + not %r11 + mov %r10, -8(%rdi,%rdx,8) + mov %r11, -16(%rdi,%rdx,8) + sub $2, %rdx + +.Lrll: neg %ecx +.Lrlx: mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r11 + + sub $4, %rdx + jb .Lend .align 16, 0x90 -.Ltop: movdqu 40(%rsi,%rdx,8), %xmm1 - movdqu 48(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movdqa %xmm0, 48(%rdi,%rdx,8) -.Li70: - movdqu 24(%rsi,%rdx,8), %xmm1 - movdqu 32(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movdqa %xmm0, 32(%rdi,%rdx,8) -.Li56: - movdqu 8(%rsi,%rdx,8), %xmm1 - movdqu 16(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movdqa %xmm0, 16(%rdi,%rdx,8) -.Li34: - movdqu -8(%rsi,%rdx,8), %xmm1 - movdqu (%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movdqa %xmm0, (%rdi,%rdx,8) - sub $8, %rdx - jg .Ltop - -.Lend: test $1, %dl - jnz .Lend8 - - 
movdqu (%rsi), %xmm1 - pxor %xmm0, %xmm0 - punpcklqdq %xmm1, %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movdqa %xmm0, (%rdi) +.Ltop: - ret - + neg %ecx + mov 16(%rsi,%rdx,8), %r8 + mov 8(%rsi,%rdx,8), %r9 + shr %cl, %r8 + or %r8, %r10 + shr %cl, %r9 + or %r9, %r11 + not %r10 + not %r11 + mov %r10, 24(%rdi,%rdx,8) + mov %r11, 16(%rdi,%rdx,8) + + mov 0(%rsi,%rdx,8), %r8 + mov -8(%rsi,%rdx,8), %r9 + shr %cl, %r8 + shr %cl, %r9 - .align 16, 0x90 -.Lbc: dec %edx - jz .Lend8 - - movq (%rsi,%rdx,8), %xmm1 - movq -8(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movq %xmm0, (%rdi,%rdx,8) - sub $2, %edx - jl .Lend8 - movq 8(%rsi), %xmm1 - movq (%rsi), %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movq %xmm0, 8(%rdi) - -.Lend8:movq (%rsi), %xmm0 - psllq %xmm4, %xmm0 - pxor %xmm3, %xmm0 - movq %xmm0, (%rdi) + + neg %ecx + mov 8(%rsi,%rdx,8), %r10 + mov 0(%rsi,%rdx,8), %r11 + shl %cl, %r10 + or %r10, %r8 + shl %cl, %r11 + or %r11, %r9 + not %r8 + not %r9 + mov %r8, 8(%rdi,%rdx,8) + mov %r9, 0(%rdi,%rdx,8) + + mov -8(%rsi,%rdx,8), %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r10 + shl %cl, %r11 + + sub $4, %rdx + jae .Ltop +.Lend: + neg %ecx + mov 8(%rsi), %r8 + shr %cl, %r8 + or %r8, %r10 + mov (%rsi), %r9 + shr %cl, %r9 + or %r9, %r11 + not %r10 + not %r11 + mov %r10, 16(%rdi) + mov %r11, 8(%rdi) + + neg %ecx +.Last: mov (%rsi), %r10 + shl %cl, %r10 + not %r10 + mov %r10, (%rdi) ret .size __gmpn_lshiftc,.-__gmpn_lshiftc - diff --git a/ext/gmp/gen/x86_64-linux/mpn/mul_1.s b/ext/gmp/gen/x86_64-linux/mpn/mul_1.s index e8de366075..1644074e4d 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/mul_1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/mul_1.s @@ -66,8 +66,6 @@ - - @@ -81,120 +79,127 @@ + .text - .align 32, 0x90 + .align 16, 0x90 + .globl __gmpn_mul_1c + .type __gmpn_mul_1c,@function + +__gmpn_mul_1c: + + + + + push %rbx + mov %r8, %r10 + + jmp 
.Lcommon + .size __gmpn_mul_1c,.-__gmpn_mul_1c + .globl __gmpn_mul_1 .type __gmpn_mul_1,@function __gmpn_mul_1: - mov %rcx, %r10 - mov %rdx, %rcx - mov %edx, %r8d - shr $3, %rcx - and $7, %r8d - mov %r10, %rdx - lea .Ltab(%rip), %r10 - movslq (%r10,%r8,4), %r8 - lea (%r8, %r10), %r10 - jmp *%r10 - - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .Lf0-.Ltab - .long .Lf1-.Ltab - .long .Lf2-.Ltab - .long .Lf3-.Ltab - .long .Lf4-.Ltab - .long .Lf5-.Ltab - .long .Lf6-.Ltab - .long .Lf7-.Ltab - .text -.Lf0: .byte 0xc4,98,171,0xf6,6 - lea 56(%rsi), %rsi - lea -8(%rdi), %rdi - jmp .Lb0 - -.Lf3: .byte 0xc4,226,179,0xf6,6 - lea 16(%rsi), %rsi - lea 16(%rdi), %rdi - inc %rcx - jmp .Lb3 - -.Lf4: .byte 0xc4,98,171,0xf6,6 - lea 24(%rsi), %rsi - lea 24(%rdi), %rdi - inc %rcx - jmp .Lb4 - -.Lf5: .byte 0xc4,226,179,0xf6,6 - lea 32(%rsi), %rsi - lea 32(%rdi), %rdi - inc %rcx - jmp .Lb5 - -.Lf6: .byte 0xc4,98,171,0xf6,6 - lea 40(%rsi), %rsi - lea 40(%rdi), %rdi - inc %rcx - jmp .Lb6 - -.Lf7: .byte 0xc4,226,179,0xf6,6 - lea 48(%rsi), %rsi - lea 48(%rdi), %rdi - inc %rcx - jmp .Lb7 - -.Lf1: .byte 0xc4,226,179,0xf6,6 - test %rcx, %rcx - jnz .Lb1 -.L1: mov %r9, (%rdi) - ret -.Lf2: .byte 0xc4,98,171,0xf6,6 - lea 8(%rsi), %rsi - lea 8(%rdi), %rdi - .byte 0xc4,226,179,0xf6,6 - test %rcx, %rcx - jz .Lend - - .align 32, 0x90 -.Ltop: mov %r10, -8(%rdi) - adc %r8, %r9 -.Lb1: .byte 0xc4,98,171,0xf6,70,8 - adc %rax, %r10 - lea 64(%rsi), %rsi - mov %r9, (%rdi) -.Lb0: mov %r10, 8(%rdi) - .byte 0xc4,226,179,0xf6,70,208 - lea 64(%rdi), %rdi - adc %r8, %r9 -.Lb7: .byte 0xc4,98,171,0xf6,70,216 - mov %r9, -48(%rdi) - adc %rax, %r10 -.Lb6: mov %r10, -40(%rdi) - .byte 0xc4,226,179,0xf6,70,224 - adc %r8, %r9 -.Lb5: .byte 0xc4,98,171,0xf6,70,232 - mov %r9, -32(%rdi) - adc %rax, %r10 -.Lb4: .byte 0xc4,226,179,0xf6,70,240 - mov %r10, -24(%rdi) - adc %r8, %r9 -.Lb3: .byte 0xc4,98,171,0xf6,70,248 - adc %rax, %r10 - mov %r9, -16(%rdi) - dec %rcx - .byte 0xc4,226,179,0xf6,6 - jnz .Ltop - 
-.Lend: mov %r10, -8(%rdi) - adc %r8, %r9 - mov %r9, (%rdi) - adc %rcx, %rax + + push %rbx + xor %r10, %r10 +.Lcommon: + mov (%rsi), %rax + mov %rdx, %rbx + + mul %rcx + mov %rbx, %r11 + + add %r10, %rax + adc $0, %rdx + + and $3, %ebx + jz .Lb0 + cmp $2, %ebx + jz .Lb2 + jg .Lb3 + +.Lb1: dec %r11 + jne .Lgt1 + mov %rax, (%rdi) + jmp .Lret +.Lgt1: lea 8(%rsi,%r11,8), %rsi + lea -8(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + xor %ebx, %ebx + mov %rax, %r9 + mov (%rsi,%r11,8), %rax + mov %rdx, %r8 + jmp .LL1 + +.Lb0: lea (%rsi,%r11,8), %rsi + lea -16(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + mov %rax, %r8 + mov %rdx, %rbx + jmp .LL0 + +.Lb3: lea -8(%rsi,%r11,8), %rsi + lea -24(%rdi,%r11,8), %rdi + neg %r11 + mov %rax, %rbx + mov %rdx, %r10 + jmp .LL3 + +.Lb2: lea -16(%rsi,%r11,8), %rsi + lea -32(%rdi,%r11,8), %rdi + neg %r11 + xor %r8, %r8 + xor %ebx, %ebx + mov %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %rdx, %r9 + jmp .LL2 + + .align 16, 0x90 +.Ltop: mov %r10, (%rdi,%r11,8) + add %rax, %r9 + mov (%rsi,%r11,8), %rax + adc %rdx, %r8 + mov $0, %r10d +.LL1: mul %rcx + mov %r9, 8(%rdi,%r11,8) + add %rax, %r8 + adc %rdx, %rbx +.LL0: mov 8(%rsi,%r11,8), %rax + mul %rcx + mov %r8, 16(%rdi,%r11,8) + add %rax, %rbx + adc %rdx, %r10 +.LL3: mov 16(%rsi,%r11,8), %rax + mul %rcx + mov %rbx, 24(%rdi,%r11,8) + mov $0, %r8d + mov %r8, %rbx + add %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %r8, %r9 + adc %rdx, %r9 +.LL2: mul %rcx + add $4, %r11 + js .Ltop + + mov %r10, (%rdi,%r11,8) + add %rax, %r9 + adc %r8, %rdx + mov %r9, 8(%rdi,%r11,8) + add %r8, %rdx +.Lret: mov %rdx, %rax + + pop %rbx + + ret .size __gmpn_mul_1,.-__gmpn_mul_1 - diff --git a/ext/gmp/gen/x86_64-linux/mpn/mul_2.s b/ext/gmp/gen/x86_64-linux/mpn/mul_2.s index 395391597e..0c3310dfad 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/mul_2.s +++ b/ext/gmp/gen/x86_64-linux/mpn/mul_2.s @@ -81,13 +81,17 @@ + + + + .text - .align 32, 0x90 + .align 16, 0x90 .globl __gmpn_mul_2 .type __gmpn_mul_2,@function @@ -100,88 
+104,112 @@ __gmpn_mul_2: mov (%rcx), %r8 mov 8(%rcx), %r9 - lea 3(%rdx), %r11 - shr $2, %r11 - - test $1, %dl - jnz .Lbx1 + mov (%rsi), %rax + + mov %rdx, %r11 + neg %r11 + lea -8(%rsi,%rdx,8), %rsi + lea -8(%rdi,%rdx,8), %rdi + + and $3, %edx + jz .Lm2p0 + cmp $2, %edx + jc .Lm2p1 + jz .Lm2p2 +.Lm2p3: + mul %r8 + xor %r10d, %r10d + mov %rax, %rcx + mov %rdx, %rbp + mov 8(%rsi,%r11,8), %rax + add $-1, %r11 + mul %r9 + add %rax, %rbp + jmp .Lm23 +.Lm2p0: + mul %r8 + xor %ebp, %ebp + mov %rax, %rbx + mov %rdx, %rcx + jmp .Lm20 +.Lm2p1: + mul %r8 + xor %r10d, %r10d + xor %ebx, %ebx + xor %ecx, %ecx + add $1, %r11 + jmp .Lm2top +.Lm2p2: + mul %r8 + xor %ebx, %ebx + xor %ecx, %ecx + mov %rax, %rbp + mov %rdx, %r10 + mov 8(%rsi,%r11,8), %rax + add $-2, %r11 + jmp .Lm22 -.Lbx0: xor %rbx, %rbx - test $2, %dl - mov (%rsi), %rdx - .byte 0xc4,194,211,0xf6,200 - jz .Llo0 -.Lb10: lea -16(%rdi), %rdi - lea -16(%rsi), %rsi - jmp .Llo2 - -.Lbx1: xor %rbp, %rbp - test $2, %dl - mov (%rsi), %rdx - .byte 0xc4,66,227,0xf6,208 - jnz .Lb11 - -.Lb01: lea -24(%rdi), %rdi - lea 8(%rsi), %rsi - jmp .Llo1 - -.Lb11: lea -8(%rdi), %rdi - lea -8(%rsi), %rsi - jmp .Llo3 - - .align 16, 0x90 -.Ltop: .byte 0xc4,194,251,0xf6,217 - add %rax, %rbp - mov (%rsi), %rdx - .byte 0xc4,194,251,0xf6,200 - adc $0, %rbx - add %rax, %rbp - adc $0, %rcx - add %r10, %rbp -.Llo0: mov %rbp, (%rdi) - adc $0, %rcx - .byte 0xc4,194,251,0xf6,233 - add %rax, %rbx - mov 8(%rsi), %rdx - adc $0, %rbp - .byte 0xc4,66,251,0xf6,208 - add %rax, %rbx - adc $0, %r10 - add %rcx, %rbx -.Llo3: mov %rbx, 8(%rdi) - adc $0, %r10 - .byte 0xc4,194,251,0xf6,217 - add %rax, %rbp - mov 16(%rsi), %rdx - .byte 0xc4,194,251,0xf6,200 - adc $0, %rbx - add %rax, %rbp - adc $0, %rcx - add %r10, %rbp -.Llo2: mov %rbp, 16(%rdi) - adc $0, %rcx - .byte 0xc4,194,251,0xf6,233 - add %rax, %rbx - mov 24(%rsi), %rdx - adc $0, %rbp - .byte 0xc4,66,251,0xf6,208 - add %rax, %rbx - adc $0, %r10 - add %rcx, %rbx - lea 32(%rsi), %rsi -.Llo1: mov %rbx, 24(%rdi) 
- adc $0, %r10 - dec %r11 - lea 32(%rdi), %rdi - jnz .Ltop - -.Lend: .byte 0xc4,194,235,0xf6,193 - add %rdx, %rbp - adc $0, %rax - add %r10, %rbp - mov %rbp, (%rdi) - adc $0, %rax + .align 32, 0x90 +.Lm2top: + add %rax, %r10 + adc %rdx, %rbx + mov 0(%rsi,%r11,8), %rax + adc $0, %ecx + mov $0, %ebp + mul %r9 + add %rax, %rbx + mov %r10, 0(%rdi,%r11,8) + adc %rdx, %rcx + mov 8(%rsi,%r11,8), %rax + mul %r8 + add %rax, %rbx + adc %rdx, %rcx + adc $0, %ebp +.Lm20: mov 8(%rsi,%r11,8), %rax + mul %r9 + add %rax, %rcx + adc %rdx, %rbp + mov 16(%rsi,%r11,8), %rax + mov $0, %r10d + mul %r8 + add %rax, %rcx + mov 16(%rsi,%r11,8), %rax + adc %rdx, %rbp + adc $0, %r10d + mul %r9 + add %rax, %rbp + mov %rbx, 8(%rdi,%r11,8) +.Lm23: adc %rdx, %r10 + mov 24(%rsi,%r11,8), %rax + mul %r8 + mov $0, %ebx + add %rax, %rbp + adc %rdx, %r10 + mov %rcx, 16(%rdi,%r11,8) + mov 24(%rsi,%r11,8), %rax + mov $0, %ecx + adc $0, %ebx +.Lm22: mul %r9 + add %rax, %r10 + mov %rbp, 24(%rdi,%r11,8) + adc %rdx, %rbx + mov 32(%rsi,%r11,8), %rax + mul %r8 + add $4, %r11 + js .Lm2top + + + add %rax, %r10 + adc %rdx, %rbx + adc $0, %ecx + mov (%rsi), %rax + mul %r9 + mov %r10, (%rdi) + add %rax, %rbx + adc %rdx, %rcx + mov %rbx, 8(%rdi) + mov %rcx, %rax pop %rbp pop %rbx diff --git a/ext/gmp/gen/x86_64-linux/mpn/mul_basecase.s b/ext/gmp/gen/x86_64-linux/mpn/mul_basecase.s index 498782526f..2cfb7aaa17 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/mul_basecase.s +++ b/ext/gmp/gen/x86_64-linux/mpn/mul_basecase.s @@ -84,295 +84,400 @@ + .text + .align 16, 0x90 + .globl __gmpn_mul_basecase + .type __gmpn_mul_basecase,@function + +__gmpn_mul_basecase: + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + xor %r13d, %r13d + mov (%rsi), %rax + mov (%rcx), %r12 + sub %rdx, %r13 + mov %r13, %r11 + mov %edx, %ebx + lea (%rdi,%rdx,8), %rdi + lea (%rsi,%rdx,8), %rsi + mul %r12 + test $1, %r8b + jz .Lmul_2 - .text - .align 16, 0x90 - .globl __gmpn_mul_basecase - .type __gmpn_mul_basecase,@function - 
-__gmpn_mul_basecase: - +.Lmul_1: + and $3, %ebx + jz .Lmul_1_prologue_0 + cmp $2, %ebx + jc .Lmul_1_prologue_1 + jz .Lmul_1_prologue_2 - cmp $2, %rdx - ja .Lgen - mov (%rcx), %rdx - .byte 0xc4,98,251,0xf6,14 - je .Ls2x +.Lmul_1_prologue_3: + add $-1, %r11 + lea .Laddmul_outer_3(%rip), %r14 + mov %rax, %r10 + mov %rdx, %rbx + jmp .Lmul_1_entry_3 -.Ls11: mov %rax, (%rdi) - mov %r9, 8(%rdi) - - ret +.Lmul_1_prologue_0: + mov %rax, %rbp + mov %rdx, %r10 + lea .Laddmul_outer_0(%rip), %r14 + jmp .Lmul_1_entry_0 + +.Lmul_1_prologue_1: + cmp $-1, %r13 + jne 2f + mov %rax, -8(%rdi) + mov %rdx, (%rdi) + jmp .Lret +2: add $1, %r11 + lea .Laddmul_outer_1(%rip), %r14 + mov %rax, %r15 + mov %rdx, %rbp + xor %r10d, %r10d + mov (%rsi,%r11,8), %rax + jmp .Lmul_1_entry_1 + +.Lmul_1_prologue_2: + add $-2, %r11 + lea .Laddmul_outer_2(%rip), %r14 + mov %rax, %rbx + mov %rdx, %r15 + mov 24(%rsi,%r11,8), %rax + xor %ebp, %ebp + xor %r10d, %r10d + jmp .Lmul_1_entry_2 -.Ls2x: cmp $2, %r8 - .byte 0xc4,98,187,0xf6,86,8 - je .Ls22 -.Ls21: add %r8, %r9 - adc $0, %r10 - mov %rax, (%rdi) - mov %r9, 8(%rdi) - mov %r10, 16(%rdi) - ret -.Ls22: add %r8, %r9 - adc $0, %r10 - mov 8(%rcx), %rdx - mov %rax, (%rdi) - .byte 0xc4,98,187,0xf6,30 - .byte 0xc4,226,251,0xf6,86,8 - add %r11, %rax - adc $0, %rdx - add %r8, %r9 - adc %rax, %r10 - adc $0, %rdx - mov %r9, 8(%rdi) - mov %r10, 16(%rdi) - mov %rdx, 24(%rdi) + .align 16, 0x90 +.Lmul_1_top: + mov %rbx, -16(%rdi,%r11,8) + add %rax, %r15 + mov (%rsi,%r11,8), %rax + adc %rdx, %rbp +.Lmul_1_entry_1: + xor %ebx, %ebx + mul %r12 + mov %r15, -8(%rdi,%r11,8) + add %rax, %rbp + adc %rdx, %r10 +.Lmul_1_entry_0: + mov 8(%rsi,%r11,8), %rax + mul %r12 + mov %rbp, (%rdi,%r11,8) + add %rax, %r10 + adc %rdx, %rbx +.Lmul_1_entry_3: + mov 16(%rsi,%r11,8), %rax + mul %r12 + mov %r10, 8(%rdi,%r11,8) + xor %ebp, %ebp + mov %rbp, %r10 + add %rax, %rbx + mov 24(%rsi,%r11,8), %rax + mov %rbp, %r15 + adc %rdx, %r15 +.Lmul_1_entry_2: + mul %r12 + add $4, %r11 + js .Lmul_1_top + 
+ mov %rbx, -16(%rdi) + add %rax, %r15 + mov %r15, -8(%rdi) + adc %rdx, %rbp + mov %rbp, (%rdi) + + add $-1, %r8 + jz .Lret + + mov 8(%rcx), %r12 + mov 16(%rcx), %r9 + + lea 8(%rcx), %rcx + lea 8(%rdi), %rdi + + jmp *%r14 + + + + + .align 16, 0x90 +.Lmul_2: + mov 8(%rcx), %r9 + + and $3, %ebx + jz .Lmul_2_prologue_0 + cmp $2, %ebx + jz .Lmul_2_prologue_2 + jc .Lmul_2_prologue_1 + +.Lmul_2_prologue_3: + lea .Laddmul_outer_3(%rip), %r14 + add $2, %r11 + mov %rax, -16(%rdi,%r11,8) + mov %rdx, %rbp + xor %r10d, %r10d + xor %ebx, %ebx + mov -16(%rsi,%r11,8), %rax + jmp .Lmul_2_entry_3 + + .align 16, 0x90 +.Lmul_2_prologue_0: + add $3, %r11 + mov %rax, %rbx + mov %rdx, %r15 + xor %ebp, %ebp + mov -24(%rsi,%r11,8), %rax + lea .Laddmul_outer_0(%rip), %r14 + jmp .Lmul_2_entry_0 + + .align 16, 0x90 +.Lmul_2_prologue_1: + mov %rax, %r10 + mov %rdx, %rbx + xor %r15d, %r15d + lea .Laddmul_outer_1(%rip), %r14 + jmp .Lmul_2_entry_1 + + .align 16, 0x90 +.Lmul_2_prologue_2: + add $1, %r11 + lea .Laddmul_outer_2(%rip), %r14 + mov $0, %ebx + mov $0, %r15d + mov %rax, %rbp + mov -8(%rsi,%r11,8), %rax + mov %rdx, %r10 + jmp .Lmul_2_entry_2 + - ret .align 16, 0x90 -.Lgen: - push %rbx - push %rbp - push %r12 - push %r14 +.Lmul_2_top: + mov -32(%rsi,%r11,8), %rax + mul %r9 + add %rax, %rbx + adc %rdx, %r15 + mov -24(%rsi,%r11,8), %rax + xor %ebp, %ebp + mul %r12 + add %rax, %rbx + mov -24(%rsi,%r11,8), %rax + adc %rdx, %r15 + adc $0, %ebp +.Lmul_2_entry_0: + mul %r9 + add %rax, %r15 + mov %rbx, -24(%rdi,%r11,8) + adc %rdx, %rbp + mov -16(%rsi,%r11,8), %rax + mul %r12 + mov $0, %r10d + add %rax, %r15 + adc %rdx, %rbp + mov -16(%rsi,%r11,8), %rax + adc $0, %r10d + mov $0, %ebx + mov %r15, -16(%rdi,%r11,8) +.Lmul_2_entry_3: + mul %r9 + add %rax, %rbp + mov -8(%rsi,%r11,8), %rax + adc %rdx, %r10 + mov $0, %r15d + mul %r12 + add %rax, %rbp + mov -8(%rsi,%r11,8), %rax + adc %rdx, %r10 + adc %r15d, %ebx +.Lmul_2_entry_2: + mul %r9 + add %rax, %r10 + mov %rbp, -8(%rdi,%r11,8) + adc %rdx, %rbx + 
mov (%rsi,%r11,8), %rax + mul %r12 + add %rax, %r10 + adc %rdx, %rbx + adc $0, %r15d +.Lmul_2_entry_1: + add $4, %r11 + mov %r10, -32(%rdi,%r11,8) + js .Lmul_2_top + + mov -32(%rsi,%r11,8), %rax + mul %r9 + add %rax, %rbx + mov %rbx, (%rdi) + adc %rdx, %r15 + mov %r15, 8(%rdi) + + add $-2, %r8 + jz .Lret + + mov 16(%rcx), %r12 + mov 24(%rcx), %r9 + + lea 16(%rcx), %rcx + lea 16(%rdi), %rdi + + jmp *%r14 - mov %rcx, %r14 - lea 1(%rdx), %rbx + + + + + + + +.Laddmul_outer_0: + add $3, %r13 + lea 0(%rip), %r14 + + mov %r13, %r11 + mov -24(%rsi,%r13,8), %rax + mul %r12 + mov %rax, %rbx + mov -24(%rsi,%r13,8), %rax + mov %rdx, %r15 + xor %ebp, %ebp + jmp .Laddmul_entry_0 + +.Laddmul_outer_1: + mov %r13, %r11 + mov (%rsi,%r13,8), %rax + mul %r12 + mov %rax, %r10 + mov (%rsi,%r13,8), %rax + mov %rdx, %rbx + xor %r15d, %r15d + jmp .Laddmul_entry_1 + +.Laddmul_outer_2: + add $1, %r13 + lea 0(%rip), %r14 + + mov %r13, %r11 + mov -8(%rsi,%r13,8), %rax + mul %r12 + xor %ebx, %ebx + mov %rax, %rbp + xor %r15d, %r15d + mov %rdx, %r10 + mov -8(%rsi,%r13,8), %rax + jmp .Laddmul_entry_2 + +.Laddmul_outer_3: + add $2, %r13 + lea 0(%rip), %r14 + + mov %r13, %r11 + mov -16(%rsi,%r13,8), %rax + xor %r10d, %r10d + mul %r12 + mov %rax, %r15 + mov -16(%rsi,%r13,8), %rax mov %rdx, %rbp - mov %edx, %eax - and $-8, %rbx - shr $3, %rbp - neg %rbx - and $7, %eax - - mov %rbp, %rcx - mov (%r14), %rdx - lea 8(%r14), %r14 - - lea .Lmtab(%rip), %r10 - movslq (%r10,%rax,4), %r11 - lea (%r11, %r10), %r10 - jmp *%r10 - - -.Lmf0: .byte 0xc4,98,171,0xf6,30 - lea 56(%rsi), %rsi - lea -8(%rdi), %rdi - jmp .Lmb0 - -.Lmf3: .byte 0xc4,98,155,0xf6,14 - lea 16(%rsi), %rsi - lea 16(%rdi), %rdi - inc %rcx - jmp .Lmb3 - -.Lmf4: .byte 0xc4,98,171,0xf6,30 - lea 24(%rsi), %rsi - lea 24(%rdi), %rdi - inc %rcx - jmp .Lmb4 - -.Lmf5: .byte 0xc4,98,155,0xf6,14 - lea 32(%rsi), %rsi - lea 32(%rdi), %rdi - inc %rcx - jmp .Lmb5 - -.Lmf6: .byte 0xc4,98,171,0xf6,30 - lea 40(%rsi), %rsi - lea 40(%rdi), %rdi - inc %rcx - jmp 
.Lmb6 - -.Lmf7: .byte 0xc4,98,155,0xf6,14 - lea 48(%rsi), %rsi - lea 48(%rdi), %rdi - inc %rcx - jmp .Lmb7 - -.Lmf1: .byte 0xc4,98,155,0xf6,14 - jmp .Lmb1 - -.Lmf2: .byte 0xc4,98,171,0xf6,30 - lea 8(%rsi), %rsi - lea 8(%rdi), %rdi - .byte 0xc4,98,155,0xf6,14 + jmp .Laddmul_entry_3 + + + + .align 16, 0x90 +.Laddmul_top: + add %r10, -32(%rdi,%r11,8) + adc %rax, %rbx + mov -24(%rsi,%r11,8), %rax + adc %rdx, %r15 + xor %ebp, %ebp + mul %r12 + add %rax, %rbx + mov -24(%rsi,%r11,8), %rax + adc %rdx, %r15 + adc %ebp, %ebp +.Laddmul_entry_0: + mul %r9 + xor %r10d, %r10d + add %rbx, -24(%rdi,%r11,8) + adc %rax, %r15 + mov -16(%rsi,%r11,8), %rax + adc %rdx, %rbp + mul %r12 + add %rax, %r15 + mov -16(%rsi,%r11,8), %rax + adc %rdx, %rbp + adc $0, %r10d +.Laddmul_entry_3: + mul %r9 + add %r15, -16(%rdi,%r11,8) + adc %rax, %rbp + mov -8(%rsi,%r11,8), %rax + adc %rdx, %r10 + mul %r12 + xor %ebx, %ebx + add %rax, %rbp + adc %rdx, %r10 + mov $0, %r15d + mov -8(%rsi,%r11,8), %rax + adc %r15d, %ebx +.Laddmul_entry_2: + mul %r9 + add %rbp, -8(%rdi,%r11,8) + adc %rax, %r10 + adc %rdx, %rbx + mov (%rsi,%r11,8), %rax + mul %r12 + add %rax, %r10 + mov (%rsi,%r11,8), %rax + adc %rdx, %rbx + adc $0, %r15d +.Laddmul_entry_1: + mul %r9 + add $4, %r11 + js .Laddmul_top + + add %r10, -8(%rdi) + adc %rax, %rbx + mov %rbx, (%rdi) + adc %rdx, %r15 + mov %r15, 8(%rdi) + + add $-2, %r8 + jz .Lret + + lea 16(%rdi), %rdi + lea 16(%rcx), %rcx + + mov (%rcx), %r12 + mov 8(%rcx), %r9 + + jmp *%r14 .align 16, 0x90 -.Lm1top: - mov %r10, -8(%rdi) - adc %r11, %r12 -.Lmb1: .byte 0xc4,98,171,0xf6,94,8 - adc %r9, %r10 - lea 64(%rsi), %rsi - mov %r12, (%rdi) -.Lmb0: mov %r10, 8(%rdi) - .byte 0xc4,98,155,0xf6,78,208 - lea 64(%rdi), %rdi - adc %r11, %r12 -.Lmb7: .byte 0xc4,98,171,0xf6,94,216 - mov %r12, -48(%rdi) - adc %r9, %r10 -.Lmb6: mov %r10, -40(%rdi) - .byte 0xc4,98,155,0xf6,78,224 - adc %r11, %r12 -.Lmb5: .byte 0xc4,98,171,0xf6,94,232 - mov %r12, -32(%rdi) - adc %r9, %r10 -.Lmb4: .byte 
0xc4,98,155,0xf6,78,240 - mov %r10, -24(%rdi) - adc %r11, %r12 -.Lmb3: .byte 0xc4,98,171,0xf6,94,248 - adc %r9, %r10 - mov %r12, -16(%rdi) - dec %rcx - .byte 0xc4,98,155,0xf6,14 - jnz .Lm1top - -.Lm1end: - mov %r10, -8(%rdi) - adc %r11, %r12 - mov %r12, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - - dec %r8 - jz .Ldone - - lea .Latab(%rip), %r10 - movslq (%r10,%rax,4), %rax - lea (%rax, %r10), %rax - - -.Louter: - lea (%rsi,%rbx,8), %rsi - mov %rbp, %rcx - mov (%r14), %rdx - lea 8(%r14), %r14 - jmp *%rax - -.Lf0: .byte 0xc4,98,171,0xf6,94,8 - lea 8(%rdi,%rbx,8), %rdi - lea -1(%rcx), %rcx - jmp .Lb0 - -.Lf3: .byte 0xc4,98,155,0xf6,78,240 - lea -56(%rdi,%rbx,8), %rdi - jmp .Lb3 - -.Lf4: .byte 0xc4,98,171,0xf6,94,232 - lea -56(%rdi,%rbx,8), %rdi - jmp .Lb4 - -.Lf5: .byte 0xc4,98,155,0xf6,78,224 - lea -56(%rdi,%rbx,8), %rdi - jmp .Lb5 - -.Lf6: .byte 0xc4,98,171,0xf6,94,216 - lea -56(%rdi,%rbx,8), %rdi - jmp .Lb6 - -.Lf7: .byte 0xc4,98,155,0xf6,78,16 - lea 8(%rdi,%rbx,8), %rdi - jmp .Lb7 - -.Lf1: .byte 0xc4,98,155,0xf6,14 - lea 8(%rdi,%rbx,8), %rdi - jmp .Lb1 - -.Lam1end: - .byte 0xf3,76,0x0f,0x38,0xf6,39 - .byte 0xf3,76,0x0f,0x38,0xf6,201 - mov %r12, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - - dec %r8 - jnz .Louter -.Ldone: +.Lret: pop %r15 pop %r14 + pop %r13 pop %r12 pop %rbp pop %rbx ret -.Lf2: - .byte 0xc4,98,171,0xf6,94,248 - lea 8(%rdi,%rbx,8), %rdi - .byte 0xc4,98,155,0xf6,14 - - .align 16, 0x90 -.Lam1top: - .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,227 - mov %r10, -8(%rdi) - jrcxz .Lam1end -.Lb1: .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,39 - lea -1(%rcx), %rcx - mov %r12, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 -.Lb0: .byte 0xc4,98,155,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,227 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) -.Lb7: .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,103,16 - mov %r12, 16(%rdi) -.Lb6: .byte 
0xc4,98,155,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,227 - mov %r10, 24(%rdi) -.Lb5: .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,103,32 - mov %r12, 32(%rdi) -.Lb4: .byte 0xc4,98,155,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,227 - mov %r10, 40(%rdi) -.Lb3: .byte 0xf3,76,0x0f,0x38,0xf6,103,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r12, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,155,0xf6,14 - jmp .Lam1top - - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Lmtab:.long .Lmf0-.Lmtab - .long .Lmf1-.Lmtab - .long .Lmf2-.Lmtab - .long .Lmf3-.Lmtab - .long .Lmf4-.Lmtab - .long .Lmf5-.Lmtab - .long .Lmf6-.Lmtab - .long .Lmf7-.Lmtab -.Latab:.long .Lf0-.Latab - .long .Lf1-.Latab - .long .Lf2-.Latab - .long .Lf3-.Latab - .long .Lf4-.Latab - .long .Lf5-.Latab - .long .Lf6-.Latab - .long .Lf7-.Latab - .text .size __gmpn_mul_basecase,.-__gmpn_mul_basecase diff --git a/ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s b/ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s index 81d8b64e47..d76272ca92 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s +++ b/ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s @@ -55,6 +55,16 @@ + + + + + + + + + + @@ -67,340 +77,363 @@ .text - .align 32, 0x90 + .align 16, 0x90 .globl __gmpn_mullo_basecase .type __gmpn_mullo_basecase,@function __gmpn_mullo_basecase: - cmp $4, %ecx - jae .Lbig + cmp $4, %rcx + jge .Lgen + mov (%rsi), %rax + mov (%rdx), %r8 - mov %rdx, %r11 - mov (%rsi), %rdx + lea .Ltab(%rip), %r9 + movslq (%r9,%rcx,4), %r10 + add %r10, %r9 + jmp *%r9 + + .section .data.rel.ro.local,"a",@progbits + .align 8, 0x90 +.Ltab: .long .Ltab-.Ltab + .long .L1-.Ltab + .long .L2-.Ltab + .long .L3-.Ltab + .text - cmp $2, %ecx - jae .Lgt1 -.Ln1: imul (%r11), %rdx - mov %rdx, (%rdi) +.L1: imul %r8, %rax + mov %rax, (%rdi) ret -.Lgt1: ja .Lgt2 -.Ln2: mov (%r11), %r9 - .byte 
0xc4,194,251,0xf6,209 + +.L2: mov 8(%rdx), %r11 + imul %rax, %r11 + mul %r8 mov %rax, (%rdi) - mov 8(%rsi), %rax - imul %r9, %rax - add %rax, %rdx - mov 8(%r11), %r9 - mov (%rsi), %rcx - imul %r9, %rcx - add %rcx, %rdx - mov %rdx, 8(%rdi) + imul 8(%rsi), %r8 + lea (%r11, %rdx), %rax + add %r8, %rax + mov %rax, 8(%rdi) ret -.Lgt2: -.Ln3: mov (%r11), %r9 - .byte 0xc4,66,251,0xf6,209 - mov %rax, (%rdi) - mov 8(%rsi), %rdx - .byte 0xc4,194,251,0xf6,209 - imul 16(%rsi), %r9 - add %rax, %r10 - adc %rdx, %r9 - mov 8(%r11), %r8 - mov (%rsi), %rdx - .byte 0xc4,194,251,0xf6,208 - add %rax, %r10 + +.L3: mov 8(%rdx), %r9 + mov 16(%rdx), %r11 + mul %r8 + mov %rax, (%rdi) + mov (%rsi), %rax + mov %rdx, %rcx + mul %r9 + imul 8(%rsi), %r9 + mov 16(%rsi), %r10 + imul %r8, %r10 + add %rax, %rcx adc %rdx, %r9 - imul 8(%rsi), %r8 - add %r8, %r9 - mov %r10, 8(%rdi) - mov 16(%r11), %r10 - mov (%rsi), %rax - imul %rax, %r10 add %r10, %r9 + mov 8(%rsi), %rax + mul %r8 + add %rax, %rcx + adc %rdx, %r9 + mov %r11, %rax + imul (%rsi), %rax + add %rax, %r9 + mov %rcx, 8(%rdi) mov %r9, 16(%rdi) ret - .align 16, 0x90 -.Lbig: push %r14 - push %r12 - push %rbx +.L0m4: +.L1m4: +.L2m4: +.L3m4: +.Lgen: push %rbx push %rbp - mov -8(%rdx,%rcx,8), %r14 - imul (%rsi), %r14 - lea -3(%rcx), %ebp - lea 8(%rdx), %r11 - mov (%rdx), %rdx - - mov %ecx, %eax - shr $3, %ecx - and $7, %eax - lea .Lmtab(%rip), %r10 - movslq (%r10,%rax,4), %rax - lea (%rax, %r10), %r10 - jmp *%r10 - - -.Lmf0: .byte 0xc4,98,171,0xf6,6 - lea 56(%rsi), %rsi + push %r13 + push %r14 + push %r15 + + mov (%rsi), %rax + mov (%rdx), %r13 + mov %rdx, %r11 + + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + neg %rcx + + mul %r13 + + test $1, %cl + jz .Lmul_2 + +.Lmul_1: lea -8(%rdi), %rdi - lea .Lf7(%rip), %rbx - jmp .Lmb0 + lea -8(%rsi), %rsi + test $2, %cl + jnz .Lmul_1_prologue_3 + +.Lmul_1_prologue_2: + lea -1(%rcx), %r9 + lea .Laddmul_outer_1(%rip), %r8 + mov %rax, %rbx + mov %rdx, %r15 + xor %ebp, %ebp + xor %r10d, %r10d + mov 
16(%rsi,%rcx,8), %rax + jmp .Lmul_1_entry_2 + +.Lmul_1_prologue_3: + lea 1(%rcx), %r9 + lea .Laddmul_outer_3(%rip), %r8 + mov %rax, %rbp + mov %rdx, %r10 + xor %ebx, %ebx + jmp .Lmul_1_entry_0 + + .align 16, 0x90 +.Lmul_1_top: + mov %rbx, -16(%rdi,%r9,8) + add %rax, %r15 + mov (%rsi,%r9,8), %rax + adc %rdx, %rbp + xor %ebx, %ebx + mul %r13 + mov %r15, -8(%rdi,%r9,8) + add %rax, %rbp + adc %rdx, %r10 +.Lmul_1_entry_0: + mov 8(%rsi,%r9,8), %rax + mul %r13 + mov %rbp, (%rdi,%r9,8) + add %rax, %r10 + adc %rdx, %rbx + mov 16(%rsi,%r9,8), %rax + mul %r13 + mov %r10, 8(%rdi,%r9,8) + xor %ebp, %ebp + mov %rbp, %r10 + add %rax, %rbx + mov 24(%rsi,%r9,8), %rax + mov %rbp, %r15 + adc %rdx, %r15 +.Lmul_1_entry_2: + mul %r13 + add $4, %r9 + js .Lmul_1_top + + mov %rbx, -16(%rdi) + add %rax, %r15 + mov %r15, -8(%rdi) + adc %rdx, %rbp + + imul (%rsi), %r13 + add %r13, %rbp + mov %rbp, (%rdi) + + add $1, %rcx + jz .Lret + + mov 8(%r11), %r13 + mov 16(%r11), %r14 -.Lmf3: .byte 0xc4,226,179,0xf6,6 lea 16(%rsi), %rsi - lea 16(%rdi), %rdi - jrcxz .Lmc - inc %ecx - lea .Lf2(%rip), %rbx - jmp .Lmb3 + lea 8(%r11), %r11 + lea 24(%rdi), %rdi + + jmp *%r8 -.Lmc: .byte 0xc4,98,171,0xf6,70,248 + +.Lmul_2: + mov 8(%r11), %r14 + test $2, %cl + jz .Lmul_2_prologue_3 + + .align 16, 0x90 +.Lmul_2_prologue_1: + lea 0(%rcx), %r9 + mov %rax, %r10 + mov %rdx, %rbx + xor %r15d, %r15d + mov (%rsi,%rcx,8), %rax + lea .Laddmul_outer_3(%rip), %r8 + jmp .Lmul_2_entry_1 + + .align 16, 0x90 +.Lmul_2_prologue_3: + lea 2(%rcx), %r9 + mov $0, %r10d + mov %rax, %r15 + mov (%rsi,%rcx,8), %rax + mov %rdx, %rbp + lea .Laddmul_outer_1(%rip), %r8 + jmp .Lmul_2_entry_3 + + .align 16, 0x90 +.Lmul_2_top: + mov -32(%rsi,%r9,8), %rax + mul %r14 + add %rax, %rbx + adc %rdx, %r15 + mov -24(%rsi,%r9,8), %rax + xor %ebp, %ebp + mul %r13 + add %rax, %rbx + mov -24(%rsi,%r9,8), %rax + adc %rdx, %r15 + adc $0, %ebp + mul %r14 + add %rax, %r15 + mov %rbx, -24(%rdi,%r9,8) + adc %rdx, %rbp + mov -16(%rsi,%r9,8), %rax + mul %r13 + 
mov $0, %r10d + add %rax, %r15 + adc %rdx, %rbp + mov -16(%rsi,%r9,8), %rax + adc $0, %r10d +.Lmul_2_entry_3: + mov $0, %ebx + mov %r15, -16(%rdi,%r9,8) + mul %r14 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + mov $0, %r15d + mul %r13 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + adc %r15d, %ebx + mul %r14 add %rax, %r10 - mov %r9, -16(%rdi) - .byte 0xc4,226,179,0xf6,6 - mov %r10, -8(%rdi) - adc %r8, %r9 - mov %r9, (%rdi) - jmp .Lc2 - -.Lmf4: .byte 0xc4,98,171,0xf6,6 - lea 24(%rsi), %rsi - lea 24(%rdi), %rdi - inc %ecx - lea .Lf3(%rip), %rbx - jmp .Lmb4 - -.Lmf5: .byte 0xc4,226,179,0xf6,6 - lea 32(%rsi), %rsi - lea 32(%rdi), %rdi - inc %ecx - lea .Lf4(%rip), %rbx - jmp .Lmb5 - -.Lmf6: .byte 0xc4,98,171,0xf6,6 - lea 40(%rsi), %rsi - lea 40(%rdi), %rdi - inc %ecx - lea .Lf5(%rip), %rbx - jmp .Lmb6 - -.Lmf7: .byte 0xc4,226,179,0xf6,6 - lea 48(%rsi), %rsi - lea 48(%rdi), %rdi - lea .Lf6(%rip), %rbx - jmp .Lmb7 - -.Lmf1: .byte 0xc4,226,179,0xf6,6 - lea .Lf0(%rip), %rbx - jmp .Lmb1 - -.Lmf2: .byte 0xc4,98,171,0xf6,6 - lea 8(%rsi), %rsi - lea 8(%rdi), %rdi - lea .Lf1(%rip), %rbx - .byte 0xc4,226,179,0xf6,6 - - - .align 32, 0x90 -.Lmtop:mov %r10, -8(%rdi) - adc %r8, %r9 -.Lmb1: .byte 0xc4,98,171,0xf6,70,8 - adc %rax, %r10 - lea 64(%rsi), %rsi - mov %r9, (%rdi) -.Lmb0: mov %r10, 8(%rdi) - .byte 0xc4,226,179,0xf6,70,208 - lea 64(%rdi), %rdi - adc %r8, %r9 -.Lmb7: .byte 0xc4,98,171,0xf6,70,216 - mov %r9, -48(%rdi) - adc %rax, %r10 -.Lmb6: mov %r10, -40(%rdi) - .byte 0xc4,226,179,0xf6,70,224 - adc %r8, %r9 -.Lmb5: .byte 0xc4,98,171,0xf6,70,232 - mov %r9, -32(%rdi) - adc %rax, %r10 -.Lmb4: .byte 0xc4,226,179,0xf6,70,240 - mov %r10, -24(%rdi) - adc %r8, %r9 -.Lmb3: .byte 0xc4,98,171,0xf6,70,248 + mov %rbp, -8(%rdi,%r9,8) + adc %rdx, %rbx + mov (%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + adc %rdx, %rbx + adc $0, %r15d +.Lmul_2_entry_1: + add $4, %r9 + mov %r10, -32(%rdi,%r9,8) + js .Lmul_2_top + + imul -16(%rsi), %r14 + add %r14, %rbx + imul 
-8(%rsi), %r13 + add %r13, %rbx + mov %rbx, -8(%rdi) + + add $2, %rcx + jz .Lret + + mov 16(%r11), %r13 + mov 24(%r11), %r14 + + lea 16(%r11), %r11 + lea 16(%rdi), %rdi + + jmp *%r8 + + +.Laddmul_outer_1: + lea -2(%rcx), %r9 + mov -16(%rsi,%rcx,8), %rax + mul %r13 + mov %rax, %r10 + mov -16(%rsi,%rcx,8), %rax + mov %rdx, %rbx + xor %r15d, %r15d + lea .Laddmul_outer_3(%rip), %r8 + jmp .Laddmul_entry_1 + +.Laddmul_outer_3: + lea 0(%rcx), %r9 + mov -16(%rsi,%rcx,8), %rax + xor %r10d, %r10d + mul %r13 + mov %rax, %r15 + mov -16(%rsi,%rcx,8), %rax + mov %rdx, %rbp + lea .Laddmul_outer_1(%rip), %r8 + jmp .Laddmul_entry_3 + + .align 16, 0x90 +.Laddmul_top: + add %r10, -32(%rdi,%r9,8) + adc %rax, %rbx + mov -24(%rsi,%r9,8), %rax + adc %rdx, %r15 + xor %ebp, %ebp + mul %r13 + add %rax, %rbx + mov -24(%rsi,%r9,8), %rax + adc %rdx, %r15 + adc %ebp, %ebp + mul %r14 + xor %r10d, %r10d + add %rbx, -24(%rdi,%r9,8) + adc %rax, %r15 + mov -16(%rsi,%r9,8), %rax + adc %rdx, %rbp + mul %r13 + add %rax, %r15 + mov -16(%rsi,%r9,8), %rax + adc %rdx, %rbp + adc $0, %r10d +.Laddmul_entry_3: + mul %r14 + add %r15, -16(%rdi,%r9,8) + adc %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + mul %r13 + xor %ebx, %ebx + add %rax, %rbp + adc %rdx, %r10 + mov $0, %r15d + mov -8(%rsi,%r9,8), %rax + adc %r15d, %ebx + mul %r14 + add %rbp, -8(%rdi,%r9,8) adc %rax, %r10 - mov %r9, -16(%rdi) - dec %ecx - .byte 0xc4,226,179,0xf6,6 - jnz .Lmtop - -.Lmend:mov %r10, -8(%rdi) - adc %r8, %r9 - mov %r9, (%rdi) - adc %rcx, %rax - - lea 8(,%rbp,8), %r12 - neg %r12 - shr $3, %ebp - jmp .Lent - -.Lf0: .byte 0xc4,98,171,0xf6,6 - lea -8(%rsi), %rsi - lea -8(%rdi), %rdi - lea .Lf7(%rip), %rbx - jmp .Lb0 - -.Lf1: .byte 0xc4,226,179,0xf6,6 - lea -1(%rbp), %ebp - lea .Lf0(%rip), %rbx - jmp .Lb1 - -.Lend: .byte 0xf3,76,0x0f,0x38,0xf6,15 - mov %r9, (%rdi) - .byte 0xf3,72,0x0f,0x38,0xf6,193 - adc %rcx, %rax - lea 8(%r12), %r12 -.Lent: .byte 0xc4,98,171,0xf6,70,8 - add %rax, %r14 - add %r10, %r14 - lea (%rsi,%r12), %rsi 
- lea 8(%rdi,%r12), %rdi - mov (%r11), %rdx - lea 8(%r11), %r11 - or %ebp, %ecx - jmp *%rbx + adc %rdx, %rbx + mov (%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + mov (%rsi,%r9,8), %rax + adc %rdx, %rbx + adc $0, %r15d +.Laddmul_entry_1: + mul %r14 + add $4, %r9 + js .Laddmul_top + + add %r10, -32(%rdi) + adc %rax, %rbx + + imul -24(%rsi), %r13 + add %r13, %rbx + add %rbx, -24(%rdi) + + add $2, %rcx + jns .Lret + + lea 16(%r11), %r11 + + mov (%r11), %r13 + mov 8(%r11), %r14 -.Lf7: .byte 0xc4,226,179,0xf6,6 lea -16(%rsi), %rsi - lea -16(%rdi), %rdi - lea .Lf6(%rip), %rbx - jmp .Lb7 - -.Lf2: .byte 0xc4,98,171,0xf6,6 - lea 8(%rsi), %rsi - lea 8(%rdi), %rdi - .byte 0xc4,226,179,0xf6,6 - lea .Lf1(%rip), %rbx - - - .align 32, 0x90 -.Ltop: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, -8(%rdi) - jrcxz .Lend -.Lb1: .byte 0xc4,98,171,0xf6,70,8 - .byte 0xf3,76,0x0f,0x38,0xf6,15 - lea -1(%rcx), %ecx - mov %r9, (%rdi) - .byte 0x66,76,0x0f,0x38,0xf6,208 -.Lb0: .byte 0xc4,226,179,0xf6,70,16 - .byte 0x66,77,0x0f,0x38,0xf6,200 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) -.Lb7: .byte 0xc4,98,171,0xf6,70,24 - lea 64(%rsi), %rsi - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xf3,76,0x0f,0x38,0xf6,79,16 - mov %r9, 16(%rdi) -.Lb6: .byte 0xc4,226,179,0xf6,70,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, 24(%rdi) -.Lb5: .byte 0xc4,98,171,0xf6,70,232 - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xf3,76,0x0f,0x38,0xf6,79,32 - mov %r9, 32(%rdi) -.Lb4: .byte 0xc4,226,179,0xf6,70,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, 40(%rdi) -.Lb3: .byte 0xf3,76,0x0f,0x38,0xf6,79,48 - .byte 0xc4,98,171,0xf6,70,248 - mov %r9, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xc4,226,179,0xf6,6 - jmp .Ltop - -.Lf6: .byte 0xc4,98,171,0xf6,6 - lea 40(%rsi), %rsi - lea -24(%rdi), %rdi - lea .Lf5(%rip), %rbx - jmp .Lb6 - -.Lf5: .byte 0xc4,226,179,0xf6,6 - 
lea 32(%rsi), %rsi - lea -32(%rdi), %rdi - lea .Lf4(%rip), %rbx - jmp .Lb5 - -.Lf4: .byte 0xc4,98,171,0xf6,6 - lea 24(%rsi), %rsi - lea -40(%rdi), %rdi - lea .Lf3(%rip), %rbx - jmp .Lb4 - -.Lf3: .byte 0xc4,226,179,0xf6,6 - lea 16(%rsi), %rsi - lea -48(%rdi), %rdi - jrcxz .Lcor - lea .Lf2(%rip), %rbx - jmp .Lb3 - -.Lcor: .byte 0xf3,76,0x0f,0x38,0xf6,79,48 - .byte 0xc4,98,171,0xf6,70,248 - mov %r9, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xc4,226,179,0xf6,6 - .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, -8(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,15 - mov %r9, (%rdi) - .byte 0xf3,72,0x0f,0x38,0xf6,193 -.Lc2: - .byte 0xc4,98,171,0xf6,70,8 - adc %rax, %r14 - add %r10, %r14 - mov (%r11), %rdx - test %ecx, %ecx - .byte 0xc4,98,171,0xf6,70,240 - .byte 0xc4,226,179,0xf6,70,248 - .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, -8(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,15 - .byte 0xf3,72,0x0f,0x38,0xf6,193 - adc %rcx, %rax - .byte 0xc4,98,171,0xf6,6 - add %rax, %r14 - add %r10, %r14 - mov 8(%r11), %rdx - .byte 0xc4,226,243,0xf6,70,240 - add %r9, %rcx - mov %rcx, (%rdi) - adc $0, %rax - .byte 0xc4,98,171,0xf6,70,248 - add %rax, %r14 - add %r10, %r14 - mov %r14, 8(%rdi) + + jmp *%r8 + +.Lret: pop %r15 + pop %r14 + pop %r13 pop %rbp pop %rbx - pop %r12 - pop %r14 ret .size __gmpn_mullo_basecase,.-__gmpn_mullo_basecase - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Lmtab:.long .Lmf7-.Lmtab - .long .Lmf0-.Lmtab - .long .Lmf1-.Lmtab - .long .Lmf2-.Lmtab - .long .Lmf3-.Lmtab - .long .Lmf4-.Lmtab - .long .Lmf5-.Lmtab - .long .Lmf6-.Lmtab diff --git a/ext/gmp/gen/x86_64-linux/mpn/mulmid_basecase.s b/ext/gmp/gen/x86_64-linux/mpn/mulmid_basecase.s new file mode 100644 index 0000000000..b607e84aca --- /dev/null +++ b/ext/gmp/gen/x86_64-linux/mpn/mulmid_basecase.s @@ -0,0 +1,573 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_mulmid_basecase + .type __gmpn_mulmid_basecase,@function + +__gmpn_mulmid_basecase: + + + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rcx, %r15 + + + lea 1(%rdx), %r13 + sub %r8, %r13 + + lea (%rdi,%r13,8), %rdi + + cmp $4, %r13 + jc .Ldiagonal + + lea (%rsi,%rdx,8), %rsi + + test $1, %r8 + jz .Lmul_2 + + + + +.Lmul_1: + mov %r13d, %ebx + + neg %r13 + mov (%rsi,%r13,8), %rax + mov (%r15), %r12 + mul %r12 + + and $-4, %r13 + mov %r13, %r11 + + and $3, %ebx + jz .Lmul_1_prologue_0 + cmp $2, %ebx + jc .Lmul_1_prologue_1 + jz .Lmul_1_prologue_2 + +.Lmul_1_prologue_3: + mov %rax, %r10 + mov %rdx, %rbx + lea .Laddmul_prologue_3(%rip), %r14 + jmp .Lmul_1_entry_3 + + .align 16, 0x90 +.Lmul_1_prologue_0: + mov %rax, %rbp + mov %rdx, %r10 + lea .Laddmul_prologue_0(%rip), %r14 + jmp .Lmul_1_entry_0 + + .align 16, 0x90 +.Lmul_1_prologue_1: + add $4, %r11 + mov %rax, %rcx + mov %rdx, %rbp + mov $0, %r10d + mov (%rsi,%r11,8), %rax + lea .Laddmul_prologue_1(%rip), %r14 + jmp .Lmul_1_entry_1 + + .align 16, 0x90 +.Lmul_1_prologue_2: + mov %rax, %rbx + mov %rdx, %rcx + mov 24(%rsi,%r11,8), %rax + mov $0, %ebp + mov $0, %r10d + lea .Laddmul_prologue_2(%rip), %r14 + jmp .Lmul_1_entry_2 + + + + + .align 16, 0x90 +.Lmul_1_top: + mov %rbx, -16(%rdi,%r11,8) + add %rax, %rcx + mov (%rsi,%r11,8), %rax + adc %rdx, %rbp +.Lmul_1_entry_1: + mov $0, %ebx + mul %r12 + mov %rcx, -8(%rdi,%r11,8) + add %rax, %rbp + adc %rdx, %r10 +.Lmul_1_entry_0: + mov 8(%rsi,%r11,8), %rax + mul %r12 + mov %rbp, (%rdi,%r11,8) + add %rax, %r10 + adc %rdx, %rbx +.Lmul_1_entry_3: + mov 16(%rsi,%r11,8), %rax + mul %r12 + mov %r10, 8(%rdi,%r11,8) + mov $0, %ebp + mov %rbp, %r10 + add %rax, %rbx + mov 24(%rsi,%r11,8), %rax + mov %rbp, %rcx + adc %rdx, %rcx +.Lmul_1_entry_2: + mul %r12 + add $4, %r11 + js .Lmul_1_top + + mov %rbx, -16(%rdi) + add %rax, %rcx + mov %rcx, -8(%rdi) + mov 
%rbp, 8(%rdi) + adc %rdx, %rbp + mov %rbp, (%rdi) + + dec %r8 + jz .Lret + + lea -8(%rsi), %rsi + lea 8(%r15), %r15 + + mov %r13, %r11 + mov (%r15), %r12 + mov 8(%r15), %r9 + + jmp *%r14 + + + + + .align 16, 0x90 +.Lmul_2: + mov %r13d, %ebx + + neg %r13 + mov -8(%rsi,%r13,8), %rax + mov (%r15), %r12 + mov 8(%r15), %r9 + mul %r9 + + and $-4, %r13 + mov %r13, %r11 + + and $3, %ebx + jz .Lmul_2_prologue_0 + cmp $2, %ebx + jc .Lmul_2_prologue_1 + jz .Lmul_2_prologue_2 + +.Lmul_2_prologue_3: + mov %rax, %rcx + mov %rdx, %rbp + lea .Laddmul_prologue_3(%rip), %r14 + jmp .Lmul_2_entry_3 + + .align 16, 0x90 +.Lmul_2_prologue_0: + mov %rax, %rbx + mov %rdx, %rcx + lea .Laddmul_prologue_0(%rip), %r14 + jmp .Lmul_2_entry_0 + + .align 16, 0x90 +.Lmul_2_prologue_1: + mov %rax, %r10 + mov %rdx, %rbx + mov $0, %ecx + lea .Laddmul_prologue_1(%rip), %r14 + jmp .Lmul_2_entry_1 + + .align 16, 0x90 +.Lmul_2_prologue_2: + mov %rax, %rbp + mov %rdx, %r10 + mov $0, %ebx + mov 16(%rsi,%r11,8), %rax + lea .Laddmul_prologue_2(%rip), %r14 + jmp .Lmul_2_entry_2 + + + + + .align 16, 0x90 +.Lmul_2_top: + mov -8(%rsi,%r11,8), %rax + mul %r9 + add %rax, %rbx + adc %rdx, %rcx +.Lmul_2_entry_0: + mov $0, %ebp + mov (%rsi,%r11,8), %rax + mul %r12 + add %rax, %rbx + mov (%rsi,%r11,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r9 + add %rax, %rcx + mov %rbx, (%rdi,%r11,8) + adc %rdx, %rbp +.Lmul_2_entry_3: + mov 8(%rsi,%r11,8), %rax + mul %r12 + mov $0, %r10d + add %rax, %rcx + adc %rdx, %rbp + mov $0, %ebx + adc $0, %r10d + mov 8(%rsi,%r11,8), %rax + mov %rcx, 8(%rdi,%r11,8) + mul %r9 + add %rax, %rbp + mov 16(%rsi,%r11,8), %rax + adc %rdx, %r10 +.Lmul_2_entry_2: + mov $0, %ecx + mul %r12 + add %rax, %rbp + mov 16(%rsi,%r11,8), %rax + adc %rdx, %r10 + adc $0, %ebx + mul %r9 + add %rax, %r10 + mov %rbp, 16(%rdi,%r11,8) + adc %rdx, %rbx +.Lmul_2_entry_1: + mov 24(%rsi,%r11,8), %rax + mul %r12 + add %rax, %r10 + adc %rdx, %rbx + adc $0, %ecx + add $4, %r11 + mov %r10, -8(%rdi,%r11,8) + jnz .Lmul_2_top 
+ + mov %rbx, (%rdi) + mov %rcx, 8(%rdi) + + sub $2, %r8 + jz .Lret + + lea 16(%r15), %r15 + lea -16(%rsi), %rsi + + mov %r13, %r11 + mov (%r15), %r12 + mov 8(%r15), %r9 + + jmp *%r14 + + + + + .align 16, 0x90 +.Laddmul_prologue_0: + mov -8(%rsi,%r11,8), %rax + mul %r9 + mov %rax, %rcx + mov %rdx, %rbp + mov $0, %r10d + jmp .Laddmul_entry_0 + + .align 16, 0x90 +.Laddmul_prologue_1: + mov 16(%rsi,%r11,8), %rax + mul %r9 + mov %rax, %rbx + mov %rdx, %rcx + mov $0, %ebp + mov 24(%rsi,%r11,8), %rax + jmp .Laddmul_entry_1 + + .align 16, 0x90 +.Laddmul_prologue_2: + mov 8(%rsi,%r11,8), %rax + mul %r9 + mov %rax, %r10 + mov %rdx, %rbx + mov $0, %ecx + jmp .Laddmul_entry_2 + + .align 16, 0x90 +.Laddmul_prologue_3: + mov (%rsi,%r11,8), %rax + mul %r9 + mov %rax, %rbp + mov %rdx, %r10 + mov $0, %ebx + mov $0, %ecx + jmp .Laddmul_entry_3 + + + + .align 16, 0x90 +.Laddmul_top: + mov $0, %r10d + add %rax, %rbx + mov -8(%rsi,%r11,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r9 + add %rbx, -8(%rdi,%r11,8) + adc %rax, %rcx + adc %rdx, %rbp +.Laddmul_entry_0: + mov (%rsi,%r11,8), %rax + mul %r12 + add %rax, %rcx + mov (%rsi,%r11,8), %rax + adc %rdx, %rbp + adc $0, %r10d + mul %r9 + add %rcx, (%rdi,%r11,8) + mov $0, %ecx + adc %rax, %rbp + mov $0, %ebx + adc %rdx, %r10 +.Laddmul_entry_3: + mov 8(%rsi,%r11,8), %rax + mul %r12 + add %rax, %rbp + mov 8(%rsi,%r11,8), %rax + adc %rdx, %r10 + adc $0, %ebx + mul %r9 + add %rbp, 8(%rdi,%r11,8) + adc %rax, %r10 + adc %rdx, %rbx +.Laddmul_entry_2: + mov 16(%rsi,%r11,8), %rax + mul %r12 + add %rax, %r10 + mov 16(%rsi,%r11,8), %rax + adc %rdx, %rbx + adc $0, %ecx + mul %r9 + add %r10, 16(%rdi,%r11,8) + nop + adc %rax, %rbx + mov $0, %ebp + mov 24(%rsi,%r11,8), %rax + adc %rdx, %rcx +.Laddmul_entry_1: + mul %r12 + add $4, %r11 + jnz .Laddmul_top + + add %rax, %rbx + adc %rdx, %rcx + adc $0, %ebp + + add %rbx, -8(%rdi) + adc %rcx, (%rdi) + adc %rbp, 8(%rdi) + + sub $2, %r8 + jz .Lret + + lea 16(%r15), %r15 + lea -16(%rsi), %rsi + + mov %r13, 
%r11 + mov (%r15), %r12 + mov 8(%r15), %r9 + + jmp *%r14 + + + + + .align 16, 0x90 +.Ldiagonal: + xor %ebx, %ebx + xor %ecx, %ecx + xor %ebp, %ebp + + neg %r13 + + mov %r8d, %eax + and $3, %eax + jz .Ldiag_prologue_0 + cmp $2, %eax + jc .Ldiag_prologue_1 + jz .Ldiag_prologue_2 + +.Ldiag_prologue_3: + lea -8(%r15), %r15 + mov %r15, %r10 + add $1, %r8 + mov %r8, %r11 + lea .Ldiag_entry_3(%rip), %r14 + jmp .Ldiag_entry_3 + +.Ldiag_prologue_0: + mov %r15, %r10 + mov %r8, %r11 + lea 0(%rip), %r14 + mov -8(%rsi,%r11,8), %rax + jmp .Ldiag_entry_0 + +.Ldiag_prologue_1: + lea 8(%r15), %r15 + mov %r15, %r10 + add $3, %r8 + mov %r8, %r11 + lea 0(%rip), %r14 + mov -8(%r10), %rax + jmp .Ldiag_entry_1 + +.Ldiag_prologue_2: + lea -16(%r15), %r15 + mov %r15, %r10 + add $2, %r8 + mov %r8, %r11 + lea 0(%rip), %r14 + mov 16(%r10), %rax + jmp .Ldiag_entry_2 + + + + + .align 16, 0x90 +.Ldiag_top: + add %rax, %rbx + adc %rdx, %rcx + mov -8(%rsi,%r11,8), %rax + adc $0, %rbp +.Ldiag_entry_0: + mulq (%r10) + add %rax, %rbx + adc %rdx, %rcx + adc $0, %rbp +.Ldiag_entry_3: + mov -16(%rsi,%r11,8), %rax + mulq 8(%r10) + add %rax, %rbx + mov 16(%r10), %rax + adc %rdx, %rcx + adc $0, %rbp +.Ldiag_entry_2: + mulq -24(%rsi,%r11,8) + add %rax, %rbx + mov 24(%r10), %rax + adc %rdx, %rcx + lea 32(%r10), %r10 + adc $0, %rbp +.Ldiag_entry_1: + mulq -32(%rsi,%r11,8) + sub $4, %r11 + jnz .Ldiag_top + + add %rax, %rbx + adc %rdx, %rcx + adc $0, %rbp + + mov %rbx, (%rdi,%r13,8) + + inc %r13 + jz .Ldiag_end + + mov %r8, %r11 + mov %r15, %r10 + + lea 8(%rsi), %rsi + mov %rcx, %rbx + mov %rbp, %rcx + xor %ebp, %ebp + + jmp *%r14 + +.Ldiag_end: + mov %rcx, (%rdi) + mov %rbp, 8(%rdi) + +.Lret: pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + .size __gmpn_mulmid_basecase,.-__gmpn_mulmid_basecase diff --git a/ext/gmp/gen/x86_64-linux/mpn/nand_n.s b/ext/gmp/gen/x86_64-linux/mpn/nand_n.s index ad4e827623..04593b9b51 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/nand_n.s +++ 
b/ext/gmp/gen/x86_64-linux/mpn/nand_n.s @@ -94,7 +94,6 @@ - .text @@ -107,56 +106,48 @@ __gmpn_nand_n: mov (%rdx), %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: and (%rsi), %r8 +.Lb11: and (%rsi,%rcx,8), %r8 not %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: and (%rsi), %r8 + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: and (%rsi,%rcx,8), %r8 not %r8 - mov %r8, (%rdi) - dec %rcx + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 -.Lb00: mov 8(%rdx), %r9 - and (%rsi), %r8 + +.Ltop: mov (%rdx,%rcx,8), %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + and (%rsi,%rcx,8), %r8 not %r8 - and 8(%rsi), %r9 + and 8(%rsi,%rcx,8), %r9 not %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 -.Le10: mov 24(%rdx), %r9 - lea 32(%rdx), %rdx - and 16(%rsi), %r8 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + and 16(%rsi,%rcx,8), %r8 not %r8 - and 24(%rsi), %r9 - lea 32(%rsi), %rsi + and 24(%rsi,%rcx,8), %r9 not %r9 - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/nior_n.s b/ext/gmp/gen/x86_64-linux/mpn/nior_n.s index 68dffa7222..8ea0437f09 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/nior_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/nior_n.s @@ -94,7 +94,6 @@ - .text @@ -107,56 +106,48 @@ __gmpn_nior_n: mov (%rdx), %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea 
(%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: or (%rsi), %r8 +.Lb11: or (%rsi,%rcx,8), %r8 not %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: or (%rsi), %r8 + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: or (%rsi,%rcx,8), %r8 not %r8 - mov %r8, (%rdi) - dec %rcx + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 -.Lb00: mov 8(%rdx), %r9 - or (%rsi), %r8 + +.Ltop: mov (%rdx,%rcx,8), %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + or (%rsi,%rcx,8), %r8 not %r8 - or 8(%rsi), %r9 + or 8(%rsi,%rcx,8), %r9 not %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 -.Le10: mov 24(%rdx), %r9 - lea 32(%rdx), %rdx - or 16(%rsi), %r8 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + or 16(%rsi,%rcx,8), %r8 not %r8 - or 24(%rsi), %r9 - lea 32(%rsi), %rsi + or 24(%rsi,%rcx,8), %r9 not %r9 - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/popcount.s b/ext/gmp/gen/x86_64-linux/mpn/popcount.s index d118f5bda4..243219e87c 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/popcount.s +++ b/ext/gmp/gen/x86_64-linux/mpn/popcount.s @@ -59,16 +59,15 @@ - - - - - - - - - - + + + + + + + + + @@ -86,110 +85,76 @@ __gmpn_popcount: - - - mov %esi, %r8d - and $7, %r8d - - .byte 0xf3,0x48,0x0f,0xb8,0x07 - xor %ecx, %ecx - - lea .Ltab(%rip), %r9 - - movslq (%r9,%r8,4), %r8 - add %r9, %r8 - jmp *%r8 - - -.L3: .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x10 - add 
$24, %rdi - sub $8, %rsi - jg .Le34 - add %r10, %rax - add %r11, %rax -.Ls1: - ret - -.L1: sub $8, %rsi - jle .Ls1 - .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x10 - add $8, %rdi - jmp .Le12 - -.L7: .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x10 - add $-8, %rdi - jmp .Le07 - -.L0: .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 - .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 - jmp .Le07 - -.L4: .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 - .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 - add $32, %rdi - sub $8, %rsi - jle .Lx4 + + push %rbx + mov $0x5555555555555555, %r10 + push %rbp + mov $0x3333333333333333, %r11 + lea (%rdi,%rsi,8), %rdi + mov $0x0f0f0f0f0f0f0f0f, %rcx + neg %rsi + mov $0x0101010101010101, %rdx + xor %eax, %eax + test $1, %sil + jz .Ltop + + mov (%rdi,%rsi,8), %r8 + + mov %r8, %r9 + shr %r8 + and %r10, %r8 + sub %r8, %r9 + + mov %r9, %r8 + shr $2, %r9 + and %r11, %r8 + and %r11, %r9 + add %r8, %r9 + + dec %rsi + jmp .Lmid .align 16, 0x90 -.Ltop: -.Le34: .byte 0xf3,0x4c,0x0f,0xb8,0x07 - .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x08 - add %r10, %rcx - add %r11, %rax -.Le12: .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 - .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 - add %r8, %rcx - add %r9, %rax -.Le07: .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x20 - .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x28 - add %r10, %rcx - add %r11, %rax -.Le56: .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x30 - .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x38 - add $64, %rdi - add %r8, %rcx - add %r9, %rax - sub $8, %rsi - jg .Ltop - -.Lx4: add %r10, %rcx - add %r11, %rax -.Lx2: add %rcx, %rax - +.Ltop: mov (%rdi,%rsi,8), %r8 + mov 8(%rdi,%rsi,8), %rbx + + mov %r8, %r9 + mov %rbx, %rbp + shr %r8 + shr %rbx + and %r10, %r8 + and %r10, %rbx + sub %r8, %r9 + sub %rbx, %rbp + + mov %r9, %r8 + mov %rbp, %rbx + shr $2, %r9 + shr $2, %rbp + and %r11, %r8 + and %r11, %r9 + and %r11, %rbx + and %r11, %rbp + add %r8, %r9 + add %rbx, %rbp + + add %rbp, %r9 +.Lmid: mov 
%r9, %r8 + shr $4, %r9 + and %rcx, %r8 + and %rcx, %r9 + add %r8, %r9 + + imul %rdx, %r9 + shr $56, %r9 + + add %r9, %rax + add $2, %rsi + jnc .Ltop + +.Lend: + pop %rbp + pop %rbx ret - -.L2: .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 - sub $8, %rsi - jle .Lx2 - .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x10 - .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x18 - add $16, %rdi - jmp .Le12 - -.L5: .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x10 - add $-24, %rdi - jmp .Le56 - -.L6: .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x10 - .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x18 - add $-16, %rdi - jmp .Le56 .size __gmpn_popcount,.-__gmpn_popcount - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .L0-.Ltab - .long .L1-.Ltab - .long .L2-.Ltab - .long .L3-.Ltab - .long .L4-.Ltab - .long .L5-.Ltab - .long .L6-.Ltab - .long .L7-.Ltab diff --git a/ext/gmp/gen/x86_64-linux/mpn/redc_1.s b/ext/gmp/gen/x86_64-linux/mpn/redc_1.s index a5912b7b6d..da7fd88758 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/redc_1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/redc_1.s @@ -63,6 +63,11 @@ + + + + + @@ -77,14 +82,15 @@ - - + + + .text - .align 16, 0x90 + .align 32, 0x90 .globl __gmpn_redc_1 .type __gmpn_redc_1,@function @@ -92,356 +98,506 @@ __gmpn_redc_1: - push %rbx push %rbp + mov (%rsi), %rbp + push %rbx + imul %r8, %rbp push %r12 push %r13 push %r14 push %r15 - push %rdi - mov %rdx, %rdi - mov (%rsi), %rdx - - neg %rcx - push %r8 - imul %r8, %rdx - mov %rcx, %r15 - - test $1, %cl - jnz .Lbx1 - -.Lbx0: test $2, %cl - jz .Lo0b - - cmp $-2, %ecx - jnz .Lo2 - - - mov 8(%rsp), %rbx - lea 16(%rsp), %rsp - .byte 0xc4,98,179,0xf6,39 - .byte 0xc4,98,163,0xf6,87,8 - add %r12, %r11 - adc $0, %r10 - add (%rsi), %r9 - adc 8(%rsi), %r11 - adc $0, %r10 - mov %r11, %rdx - imul %r8, %rdx - .byte 0xc4,98,147,0xf6,39 - .byte 0xc4,98,139,0xf6,127,8 - xor %eax, %eax - add %r12, %r14 - adc $0, %r15 - add %r11, %r13 - adc 16(%rsi), %r14 - adc $0, %r15 - add %r14, %r10 - adc 24(%rsi), 
%r15 - mov %r10, (%rbx) - mov %r15, 8(%rbx) - setc %al - jmp .Lret -.Lo2: lea 2(%rcx), %r14 - .byte 0xc4,98,179,0xf6,7 - .byte 0xc4,98,163,0xf6,87,8 - sar $2, %r14 - add %r8, %r11 - jmp .Llo2 + mov %rcx, %r12 + neg %r12 + lea (%rdx,%rcx,8), %r13 + lea -16(%rsi,%rcx,8), %rsi + + mov %ecx, %eax + and $3, %eax + lea 4(%rax), %r9 + cmp $4, %ecx + cmovg %r9, %rax + lea .Ltab(%rip), %r9 + + movslq (%r9,%rax,4), %rax + add %r9, %rax + jmp *%rax + + + .section .data.rel.ro.local,"a",@progbits + .align 8, 0x90 +.Ltab: .long .L0-.Ltab + .long .L1-.Ltab + .long .L2-.Ltab + .long .L3-.Ltab + .long .L0m4-.Ltab + .long .L1m4-.Ltab + .long .L2m4-.Ltab + .long .L3m4-.Ltab + .text .align 16, 0x90 -.Ltp2: adc %rax, %r9 - lea 32(%rsi), %rsi - adc %r8, %r11 -.Llo2: .byte 0xc4,98,147,0xf6,103,16 - mov (%rsi), %r8 - .byte 0xc4,226,227,0xf6,71,24 - lea 32(%rdi), %rdi - adc %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov 8(%rsi), %r10 - mov 16(%rsi), %r12 - add %r9, %r8 - mov 24(%rsi), %rbp - mov %r8, (%rsi) - adc %r11, %r10 - .byte 0xc4,98,179,0xf6,7 - mov %r10, 8(%rsi) - adc %r13, %r12 - mov %r12, 16(%rsi) - adc %rbx, %rbp - .byte 0xc4,98,163,0xf6,87,8 - mov %rbp, 24(%rsi) - inc %r14 - jnz .Ltp2 - -.Led2: mov 56(%rsi,%rcx,8), %rdx - lea 16(%rdi,%rcx,8), %rdi - adc %rax, %r9 - adc %r8, %r11 - mov 32(%rsi), %r8 - adc $0, %r10 - imul (%rsp), %rdx - mov 40(%rsi), %rax - add %r9, %r8 - mov %r8, 32(%rsi) - adc %r11, %rax - mov %rax, 40(%rsi) - lea 56(%rsi,%rcx,8), %rsi - adc $0, %r10 - mov %r10, -8(%rsi) - inc %r15 - jnz .Lo2 - - jmp .Lcj - - -.Lbx1: test $2, %cl - jz .Lo3a - -.Lo1a: cmp $-1, %ecx - jnz .Lo1b - - - mov 8(%rsp), %rbx - lea 16(%rsp), %rsp - .byte 0xc4,98,163,0xf6,23 - add (%rsi), %r11 - adc 8(%rsi), %r10 - mov %r10, (%rbx) +.L1: mov (%rdx), %rax + mul %rbp + add 8(%rsi), %rax + adc 16(%rsi), %rdx + mov %rdx, (%rdi) mov $0, %eax - setc %al + adc %eax, %eax jmp .Lret -.Lo1b: lea 24(%rdi), %rdi -.Lo1: lea 1(%rcx), %r14 - .byte 0xc4,98,163,0xf6,87,232 - .byte 
0xc4,98,147,0xf6,103,240 - .byte 0xc4,226,227,0xf6,71,248 - sar $2, %r14 - add %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov (%rsi), %r10 - mov 8(%rsi), %r12 - mov 16(%rsi), %rbp - add %r11, %r10 - jmp .Llo1 .align 16, 0x90 -.Ltp1: adc %rax, %r9 - lea 32(%rsi), %rsi - adc %r8, %r11 - .byte 0xc4,98,147,0xf6,103,16 - mov -8(%rsi), %r8 - .byte 0xc4,226,227,0xf6,71,24 - lea 32(%rdi), %rdi - adc %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov (%rsi), %r10 - mov 8(%rsi), %r12 - add %r9, %r8 - mov 16(%rsi), %rbp - mov %r8, -8(%rsi) - adc %r11, %r10 -.Llo1: .byte 0xc4,98,179,0xf6,7 +.L2: mov (%rdx), %rax + mul %rbp + xor %r14d, %r14d + mov %rax, %r10 + mov -8(%r13), %rax + mov %rdx, %r9 + mul %rbp + add (%rsi), %r10 + adc %rax, %r9 + adc %rdx, %r14 + add 8(%rsi), %r9 + adc $0, %r14 + mov %r9, %rbp + imul %r8, %rbp + mov -16(%r13), %rax + mul %rbp + xor %ebx, %ebx + mov %rax, %r10 + mov -8(%r13), %rax + mov %rdx, %r11 + mul %rbp + add %r9, %r10 + adc %rax, %r11 + adc %rdx, %rbx + add 16(%rsi), %r11 + adc $0, %rbx + xor %eax, %eax + add %r11, %r14 + adc 24(%rsi), %rbx + mov %r14, (%rdi) + mov %rbx, 8(%rdi) + adc %eax, %eax + jmp .Lret + + +.L3: mov (%rdx), %rax + mul %rbp + mov %rax, %rbx + mov %rdx, %r10 + mov -16(%r13), %rax + mul %rbp + xor %r9d, %r9d + xor %r14d, %r14d + add -8(%rsi), %rbx + adc %rax, %r10 + mov -8(%r13), %rax + adc %rdx, %r9 + mul %rbp + add (%rsi), %r10 mov %r10, (%rsi) - adc %r13, %r12 - mov %r12, 8(%rsi) - adc %rbx, %rbp - .byte 0xc4,98,163,0xf6,87,8 - mov %rbp, 16(%rsi) - inc %r14 - jnz .Ltp1 - -.Led1: mov 48(%rsi,%rcx,8), %rdx - lea 40(%rdi,%rcx,8), %rdi adc %rax, %r9 - adc %r8, %r11 - mov 24(%rsi), %r8 - adc $0, %r10 - imul (%rsp), %rdx - mov 32(%rsi), %rax - add %r9, %r8 - mov %r8, 24(%rsi) - adc %r11, %rax - mov %rax, 32(%rsi) - lea 48(%rsi,%rcx,8), %rsi - adc $0, %r10 - mov %r10, -8(%rsi) - inc %r15 - jnz .Lo1 - - jmp .Lcj - -.Lo3a: cmp $-3, %ecx - jnz .Lo3b - - -.Ln3: .byte 0xc4,226,227,0xf6,7 - .byte 0xc4,98,179,0xf6,119,8 + adc %rdx, 
%r14 + mov %r10, %rbp + imul %r8, %rbp + add %r9, 8(%rsi) + adc $0, %r14 + mov %r14, -8(%rsi) + + mov -24(%r13), %rax + mul %rbp + mov %rax, %rbx + mov %rdx, %r10 + mov -16(%r13), %rax + mul %rbp + xor %r9d, %r9d + xor %r14d, %r14d add (%rsi), %rbx - .byte 0xc4,98,163,0xf6,87,16 - adc %rax, %r9 - adc %r14, %r11 - mov 8(%rsi), %r14 - mov %r8, %rdx - adc $0, %r10 - mov 16(%rsi), %rax - add %r9, %r14 - mov %r14, 8(%rsi) - .byte 0xc4,66,235,0xf6,238 - adc %r11, %rax - mov %rax, 16(%rsi) - adc $0, %r10 - mov %r10, (%rsi) - lea 8(%rsi), %rsi - inc %r15 - jnz .Ln3 - - jmp .Lcj - -.Lo3b: lea 8(%rdi), %rdi -.Lo3: lea 4(%rcx), %r14 - .byte 0xc4,226,227,0xf6,71,248 - .byte 0xc4,98,179,0xf6,7 - mov (%rsi), %rbp - .byte 0xc4,98,163,0xf6,87,8 - sar $2, %r14 - add %rbx, %rbp - nop + adc %rax, %r10 + mov -8(%r13), %rax + adc %rdx, %r9 + mul %rbp + add 8(%rsi), %r10 + mov %r10, 8(%rsi) + adc %rax, %r9 + adc %rdx, %r14 + mov %r10, %rbp + imul %r8, %rbp + add %r9, 16(%rsi) + adc $0, %r14 + mov %r14, (%rsi) + + mov -24(%r13), %rax + mul %rbp + mov %rax, %rbx + mov %rdx, %r10 + mov -16(%r13), %rax + mul %rbp + xor %r9d, %r9d + xor %r14d, %r14d + add 8(%rsi), %rbx + adc %rax, %r10 + mov -8(%r13), %rax + adc %rdx, %r9 + mul %rbp + add 16(%rsi), %r10 adc %rax, %r9 - jmp .Llo3 + adc %rdx, %r14 + add 24(%rsi), %r9 + adc $0, %r14 + + xor %eax, %eax + add -8(%rsi), %r10 + adc (%rsi), %r9 + adc 32(%rsi), %r14 + mov %r10, (%rdi) + mov %r9, 8(%rdi) + mov %r14, 16(%rdi) + adc %eax, %eax + jmp .Lret + .align 16, 0x90 -.Ltp3: adc %rax, %r9 - lea 32(%rsi), %rsi -.Llo3: adc %r8, %r11 - .byte 0xc4,98,147,0xf6,103,16 - mov 8(%rsi), %r8 - .byte 0xc4,226,227,0xf6,71,24 - lea 32(%rdi), %rdi - adc %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov 16(%rsi), %r10 - mov 24(%rsi), %r12 - add %r9, %r8 - mov 32(%rsi), %rbp - mov %r8, 8(%rsi) - adc %r11, %r10 - .byte 0xc4,98,179,0xf6,7 - mov %r10, 16(%rsi) - adc %r13, %r12 - mov %r12, 24(%rsi) - adc %rbx, %rbp - .byte 0xc4,98,163,0xf6,87,8 - mov %rbp, 32(%rsi) - 
inc %r14 - jnz .Ltp3 - -.Led3: mov 64(%rsi,%rcx,8), %rdx - lea 24(%rdi,%rcx,8), %rdi +.L2m4: +.Llo2: mov (%r13,%r12,8), %rax + mul %rbp + xor %r14d, %r14d + xor %ebx, %ebx + mov %rax, %r10 + mov 8(%r13,%r12,8), %rax + mov 24(%rsi,%r12,8), %r15 + mov %rdx, %r9 + mul %rbp + add 16(%rsi,%r12,8), %r10 adc %rax, %r9 - adc %r8, %r11 - mov 40(%rsi), %r8 - adc $0, %r10 - imul (%rsp), %rdx - mov 48(%rsi), %rax - add %r9, %r8 - mov %r8, 40(%rsi) - adc %r11, %rax - mov %rax, 48(%rsi) - lea 64(%rsi,%rcx,8), %rsi - adc $0, %r10 - mov %r10, -8(%rsi) - inc %r15 - jnz .Lo3 - - jmp .Lcj - -.Lo0b: lea 16(%rdi), %rdi -.Lo0: mov %rcx, %r14 - .byte 0xc4,98,147,0xf6,103,240 - .byte 0xc4,226,227,0xf6,71,248 - sar $2, %r14 - add %r12, %rbx - adc $0, %rax - mov (%rsi), %r12 - mov 8(%rsi), %rbp - .byte 0xc4,98,179,0xf6,7 - add %r13, %r12 - jmp .Llo0 + mov 16(%r13,%r12,8), %rax + adc %rdx, %r14 + mul %rbp + mov $0, %r10d + lea 2(%r12), %r11 + add %r9, %r15 + imul %r8, %r15 + jmp .Le2 .align 16, 0x90 -.Ltp0: adc %rax, %r9 - lea 32(%rsi), %rsi - adc %r8, %r11 - .byte 0xc4,98,147,0xf6,103,16 +.Lli2: add %r10, (%rsi,%r11,8) + adc %rax, %r9 + mov (%r13,%r11,8), %rax + adc %rdx, %r14 + xor %r10d, %r10d + mul %rbp +.Le2: add %r9, 8(%rsi,%r11,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(%r13,%r11,8), %rax + mul %rbp + add %r14, 16(%rsi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(%r13,%r11,8), %rax + mul %rbp + add %rbx, 24(%rsi,%r11,8) + mov $0, %r14d + mov %r14, %rbx + adc %rax, %r10 + mov 24(%r13,%r11,8), %rax + mov %r14, %r9 + adc %rdx, %r9 + mul %rbp + add $4, %r11 + js .Lli2 + +.Lle2: add %r10, (%rsi) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(%rsi) + adc $0, %rdx + mov %rdx, 16(%rsi,%r12,8) + add $8, %rsi + mov %r15, %rbp + dec %rcx + jnz .Llo2 + + mov %r12, %rcx + sar $2, %rcx + lea 32(%rsi,%r12,8), %rsi + lea (%rsi,%r12,8), %rdx + mov -16(%rsi), %r8 - .byte 0xc4,226,227,0xf6,71,24 - lea 32(%rdi), %rdi - adc %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov -8(%rsi), %r10 - mov 
(%rsi), %r12 - add %r9, %r8 - mov 8(%rsi), %rbp - mov %r8, -16(%rsi) - adc %r11, %r10 - .byte 0xc4,98,179,0xf6,7 - mov %r10, -8(%rsi) - adc %r13, %r12 - mov %r12, (%rsi) -.Llo0: adc %rbx, %rbp - .byte 0xc4,98,163,0xf6,87,8 - mov %rbp, 8(%rsi) - inc %r14 - jnz .Ltp0 - -.Led0: mov 40(%rsi,%rcx,8), %rdx - lea 32(%rdi,%rcx,8), %rdi + mov -8(%rsi), %r9 + add -16(%rdx), %r8 + adc -8(%rdx), %r9 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + lea 16(%rdi), %rdi + jmp .Laddx + + + .align 16, 0x90 +.L1m4: +.Llo1: mov (%r13,%r12,8), %rax + xor %r9, %r9 + xor %ebx, %ebx + mul %rbp + mov %rax, %r9 + mov 8(%r13,%r12,8), %rax + mov 24(%rsi,%r12,8), %r15 + mov %rdx, %r14 + mov $0, %r10d + mul %rbp + add 16(%rsi,%r12,8), %r9 + adc %rax, %r14 + adc %rdx, %rbx + mov 16(%r13,%r12,8), %rax + mul %rbp + lea 1(%r12), %r11 + add %r14, %r15 + imul %r8, %r15 + jmp .Le1 + + .align 16, 0x90 +.Lli1: add %r10, (%rsi,%r11,8) + adc %rax, %r9 + mov (%r13,%r11,8), %rax + adc %rdx, %r14 + xor %r10d, %r10d + mul %rbp + add %r9, 8(%rsi,%r11,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(%r13,%r11,8), %rax + mul %rbp +.Le1: add %r14, 16(%rsi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(%r13,%r11,8), %rax + mul %rbp + add %rbx, 24(%rsi,%r11,8) + mov $0, %r14d + mov %r14, %rbx + adc %rax, %r10 + mov 24(%r13,%r11,8), %rax + mov %r14, %r9 + adc %rdx, %r9 + mul %rbp + add $4, %r11 + js .Lli1 + +.Lle1: add %r10, (%rsi) adc %rax, %r9 - adc %r8, %r11 - mov 16(%rsi), %r8 - adc $0, %r10 - imul (%rsp), %rdx - mov 24(%rsi), %rax - add %r9, %r8 - mov %r8, 16(%rsi) - adc %r11, %rax - mov %rax, 24(%rsi) - lea 40(%rsi,%rcx,8), %rsi - adc $0, %r10 - mov %r10, -8(%rsi) - inc %r15 - jnz .Lo0 - -.Lcj: - mov 8(%rsp), %rdi - lea 16-8(%rsp), %rsp - lea (%rsi,%rcx,8), %rdx - neg %ecx + adc %r14, %rdx + add %r9, 8(%rsi) + adc $0, %rdx + mov %rdx, 16(%rsi,%r12,8) + add $8, %rsi + mov %r15, %rbp + dec %rcx + jnz .Llo1 + + mov %r12, %rcx + sar $2, %rcx + lea 24(%rsi,%r12,8), %rsi + lea (%rsi,%r12,8), %rdx - + mov -8(%rsi), %r8 + add 
-8(%rdx), %r8 + mov %r8, (%rdi) + lea 8(%rdi), %rdi + jmp .Laddx - - call __gmpn_add_n@PLT + + .align 16, 0x90 +.L0: +.L0m4: +.Llo0: mov (%r13,%r12,8), %rax + mov %r12, %r11 + mul %rbp + xor %r10d, %r10d + mov %rax, %r14 + mov %rdx, %rbx + mov 8(%r13,%r12,8), %rax + mov 24(%rsi,%r12,8), %r15 + mul %rbp + add 16(%rsi,%r12,8), %r14 + adc %rax, %rbx + adc %rdx, %r10 + add %rbx, %r15 + imul %r8, %r15 + jmp .Le0 + + .align 16, 0x90 +.Lli0: add %r10, (%rsi,%r11,8) + adc %rax, %r9 + mov (%r13,%r11,8), %rax + adc %rdx, %r14 + xor %r10d, %r10d + mul %rbp + add %r9, 8(%rsi,%r11,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(%r13,%r11,8), %rax + mul %rbp + add %r14, 16(%rsi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 +.Le0: mov 16(%r13,%r11,8), %rax + mul %rbp + add %rbx, 24(%rsi,%r11,8) + mov $0, %r14d + mov %r14, %rbx + adc %rax, %r10 + mov 24(%r13,%r11,8), %rax + mov %r14, %r9 + adc %rdx, %r9 + mul %rbp + add $4, %r11 + js .Lli0 + +.Lle0: add %r10, (%rsi) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(%rsi) + adc $0, %rdx + mov %rdx, 16(%rsi,%r12,8) + add $8, %rsi + mov %r15, %rbp + dec %rcx + jnz .Llo0 + + mov %r12, %rcx + sar $2, %rcx + clc + lea 16(%rsi,%r12,8), %rsi + lea (%rsi,%r12,8), %rdx + jmp .Laddy + + + .align 16, 0x90 +.L3m4: +.Llo3: mov (%r13,%r12,8), %rax + mul %rbp + mov %rax, %rbx + mov %rdx, %r10 + mov 8(%r13,%r12,8), %rax + mov 24(%rsi,%r12,8), %r15 + mul %rbp + add 16(%rsi,%r12,8), %rbx + mov $0, %ebx + mov %rbx, %r14 + adc %rax, %r10 + mov 16(%r13,%r12,8), %rax + mov %r14, %r9 + adc %rdx, %r9 + add %r10, %r15 + mul %rbp + lea 3(%r12), %r11 + imul %r8, %r15 + + + .align 16, 0x90 +.Lli3: add %r10, (%rsi,%r11,8) + adc %rax, %r9 + mov (%r13,%r11,8), %rax + adc %rdx, %r14 + xor %r10d, %r10d + mul %rbp + add %r9, 8(%rsi,%r11,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(%r13,%r11,8), %rax + mul %rbp + add %r14, 16(%rsi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(%r13,%r11,8), %rax + mul %rbp + add %rbx, 24(%rsi,%r11,8) + mov $0, %r14d + mov %r14, %rbx + adc 
%rax, %r10 + mov 24(%r13,%r11,8), %rax + mov %r14, %r9 + adc %rdx, %r9 + mul %rbp + add $4, %r11 + js .Lli3 + +.Lle3: add %r10, (%rsi) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(%rsi) + adc $0, %rdx + mov %rdx, 16(%rsi,%r12,8) + mov %r15, %rbp + lea 8(%rsi), %rsi + dec %rcx + jnz .Llo3 - lea 8(%rsp), %rsp + mov %r12, %rcx + sar $2, %rcx + lea 40(%rsi,%r12,8), %rsi + lea (%rsi,%r12,8), %rdx + + mov -24(%rsi), %r8 + mov -16(%rsi), %r9 + mov -8(%rsi), %r10 + add -24(%rdx), %r8 + adc -16(%rdx), %r9 + adc -8(%rdx), %r10 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + lea 24(%rdi), %rdi + +.Laddx:inc %rcx + jz .Lad3 + +.Laddy:mov (%rsi), %r8 + mov 8(%rsi), %r9 + inc %rcx + jmp .Lmid + + +.Lal3: adc (%rdx), %r8 + adc 8(%rdx), %r9 + adc 16(%rdx), %r10 + adc 24(%rdx), %r11 + mov %r8, (%rdi) + lea 32(%rsi), %rsi + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + inc %rcx + mov %r11, 24(%rdi) + lea 32(%rdx), %rdx + mov (%rsi), %r8 + mov 8(%rsi), %r9 + lea 32(%rdi), %rdi +.Lmid: mov 16(%rsi), %r10 + mov 24(%rsi), %r11 + jnz .Lal3 + +.Lae3: adc (%rdx), %r8 + adc 8(%rdx), %r9 + adc 16(%rdx), %r10 + adc 24(%rdx), %r11 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + mov %r11, 24(%rdi) + +.Lad3: mov %ecx, %eax + adc %eax, %eax .Lret: pop %r15 pop %r14 pop %r13 pop %r12 - pop %rbp pop %rbx + pop %rbp ret .size __gmpn_redc_1,.-__gmpn_redc_1 diff --git a/ext/gmp/gen/x86_64-linux/mpn/rsblsh1_n.s b/ext/gmp/gen/x86_64-linux/mpn/rsblsh1_n.s index 7eebcc0aff..ac1323b3c6 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/rsblsh1_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/rsblsh1_n.s @@ -47,15 +47,6 @@ - - - - - - - - - @@ -78,6 +69,9 @@ + + + @@ -86,29 +80,6 @@ .text - .align 16, 0x90 - .globl __gmpn_rsblsh1_nc - .type __gmpn_rsblsh1_nc,@function - -__gmpn_rsblsh1_nc: - - - - push %rbp - mov %r8, %rax - neg %rax - xor %ebp, %ebp - mov (%rdx), %r8 - shrd $63, %r8, %rbp - mov %ecx, %r9d - and $3, %r9d - je .Lb00 - cmp $2, %r9d - jc .Lb01 - je .Lb10 - jmp .Lb11 - .size 
__gmpn_rsblsh1_nc,.-__gmpn_rsblsh1_nc - .align 16, 0x90 .globl __gmpn_rsblsh1_n .type __gmpn_rsblsh1_n,@function @@ -117,96 +88,92 @@ __gmpn_rsblsh1_n: push %rbp - xor %ebp, %ebp + mov (%rdx), %r8 - shrd $63, %r8, %rbp mov %ecx, %eax + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rdx + neg %rcx + xor %ebp, %ebp and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: mov 8(%rdx), %r9 - shrd $63, %r9, %r8 - mov 16(%rdx), %r10 - shrd $63, %r10, %r9 - add %eax, %eax - sbb (%rsi), %rbp - sbb 8(%rsi), %r8 - sbb 16(%rsi), %r9 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, %rbp - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea 24(%rdi), %rdi +.Lb11: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 sbb %eax, %eax - sub $3, %rcx - ja .Ltop - jmp .Lend - -.Lb01: add %eax, %eax - sbb (%rsi), %rbp - mov %rbp, (%rdi) - mov %r8, %rbp - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi + sub (%rsi,%rcx,8), %r8 + sbb 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + sbb 16(%rsi,%rcx,8), %r10 + mov %r10, 16(%rdi,%rcx,8) + sbb %ebp, %ebp + add $3, %rcx + jmp .Lent + +.Lb10: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 sbb %eax, %eax - sub $1, %rcx - ja .Ltop - jmp .Lend - -.Lb10: mov 8(%rdx), %r9 - shrd $63, %r9, %r8 - add %eax, %eax - sbb (%rsi), %rbp - sbb 8(%rsi), %r8 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, %rbp - lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea 16(%rdi), %rdi + sub (%rsi,%rcx,8), %r8 + sbb 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + sbb %ebp, %ebp + add $2, %rcx + jmp .Lent + +.Lb01: add %r8, %r8 sbb %eax, %eax - sub $2, %rcx - ja .Ltop - jmp .Lend + sub (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + sbb %ebp, %ebp + inc %rcx +.Lent: jns .Lend .align 16, 0x90 -.Ltop: mov (%rdx), %r8 - shrd $63, %r8, %rbp -.Lb00: mov 8(%rdx), %r9 - shrd $63, %r9, %r8 - mov 16(%rdx), %r10 - shrd $63, %r10, 
%r9 - mov 24(%rdx), %r11 - shrd $63, %r11, %r10 - lea 32(%rdx), %rdx - add %eax, %eax - sbb (%rsi), %rbp - sbb 8(%rsi), %r8 - sbb 16(%rsi), %r9 - sbb 24(%rsi), %r10 - lea 32(%rsi), %rsi - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, 24(%rdi) - mov %r11, %rbp - lea 32(%rdi), %rdi +.Ltop: add %eax, %eax + + mov (%rdx,%rcx,8), %r8 +.Lb00: adc %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 + mov 24(%rdx,%rcx,8), %r11 + adc %r11, %r11 + sbb %eax, %eax - sub $4, %rcx - jnz .Ltop + add %ebp, %ebp + + sbb (%rsi,%rcx,8), %r8 + nop + sbb 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + sbb 16(%rsi,%rcx,8), %r10 + sbb 24(%rsi,%rcx,8), %r11 + mov %r10, 16(%rdi,%rcx,8) + mov %r11, 24(%rdi,%rcx,8) + + sbb %ebp, %ebp + add $4, %rcx + js .Ltop + +.Lend: + + + sub %eax, %ebp + movslq %ebp, %rax -.Lend: shr $63, %rbp - add %eax, %eax - sbb $0, %rbp - mov %rbp, %rax pop %rbp ret .size __gmpn_rsblsh1_n,.-__gmpn_rsblsh1_n - diff --git a/ext/gmp/gen/x86_64-linux/mpn/rsblsh2_n.s b/ext/gmp/gen/x86_64-linux/mpn/rsblsh2_n.s index fe7d1d3930..e9f079a236 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/rsblsh2_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/rsblsh2_n.s @@ -47,10 +47,13 @@ - - - - + + + + + + + @@ -87,30 +90,11 @@ - .text - .align 16, 0x90 - .globl __gmpn_rsblsh2_nc - .type __gmpn_rsblsh2_nc,@function - -__gmpn_rsblsh2_nc: - - push %rbp - mov %r8, %rax - neg %rax - xor %ebp, %ebp - mov (%rdx), %r8 - shrd $62, %r8, %rbp - mov %ecx, %r9d - and $3, %r9d - je .Lb00 - cmp $2, %r9d - jc .Lb01 - je .Lb10 - jmp .Lb11 - .size __gmpn_rsblsh2_nc,.-__gmpn_rsblsh2_nc + + .text .align 16, 0x90 .globl __gmpn_rsblsh2_n .type __gmpn_rsblsh2_n,@function @@ -118,96 +102,102 @@ __gmpn_rsblsh2_nc: __gmpn_rsblsh2_n: - push %rbp - xor %ebp, %ebp + push %r12 + push %r13 + push %r14 + push %r15 + mov (%rdx), %r8 - shrd $62, %r8, %rbp + lea (,%r8,4), %r12 + shr $62, %r8 + mov %ecx, %eax - and $3, %eax + lea (%rdi,%rcx,8), 
%rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rdx + neg %rcx + and $3, %al je .Lb00 - cmp $2, %eax + cmp $2, %al jc .Lb01 je .Lb10 -.Lb11: mov 8(%rdx), %r9 - shrd $62, %r9, %r8 - mov 16(%rdx), %r10 - shrd $62, %r10, %r9 - add %eax, %eax - sbb (%rsi), %rbp - sbb 8(%rsi), %r8 - sbb 16(%rsi), %r9 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, %rbp - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea 24(%rdi), %rdi - sbb %eax, %eax - sub $3, %rcx - ja .Ltop +.Lb11: mov 8(%rdx,%rcx,8), %r10 + lea (%r8,%r10,4), %r14 + shr $62, %r10 + mov 16(%rdx,%rcx,8), %r11 + lea (%r10,%r11,4), %r15 + shr $62, %r11 + sub (%rsi,%rcx,8), %r12 + sbb 8(%rsi,%rcx,8), %r14 + sbb 16(%rsi,%rcx,8), %r15 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + mov %r14, 8(%rdi,%rcx,8) + mov %r15, 16(%rdi,%rcx,8) + add $3, %rcx + js .Ltop jmp .Lend -.Lb01: add %eax, %eax - sbb (%rsi), %rbp - mov %rbp, (%rdi) - mov %r8, %rbp - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - sbb %eax, %eax - sub $1, %rcx - ja .Ltop +.Lb01: mov %r8, %r11 + sub (%rsi,%rcx,8), %r12 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + add $1, %rcx + js .Ltop jmp .Lend -.Lb10: mov 8(%rdx), %r9 - shrd $62, %r9, %r8 - add %eax, %eax - sbb (%rsi), %rbp - sbb 8(%rsi), %r8 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, %rbp - lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea 16(%rdi), %rdi - sbb %eax, %eax - sub $2, %rcx - ja .Ltop +.Lb10: mov 8(%rdx,%rcx,8), %r11 + lea (%r8,%r11,4), %r15 + shr $62, %r11 + sub (%rsi,%rcx,8), %r12 + sbb 8(%rsi,%rcx,8), %r15 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + mov %r15, 8(%rdi,%rcx,8) + add $2, %rcx + js .Ltop jmp .Lend +.Lb00: mov 8(%rdx,%rcx,8), %r9 + mov 16(%rdx,%rcx,8), %r10 + jmp .Le00 + .align 16, 0x90 -.Ltop: mov (%rdx), %r8 - shrd $62, %r8, %rbp -.Lb00: mov 8(%rdx), %r9 - shrd $62, %r9, %r8 - mov 16(%rdx), %r10 - shrd $62, %r10, %r9 - mov 24(%rdx), %r11 - shrd $62, %r11, %r10 - lea 32(%rdx), %rdx - add %eax, %eax - sbb (%rsi), %rbp - sbb 8(%rsi), %r8 - sbb 
16(%rsi), %r9 - sbb 24(%rsi), %r10 - lea 32(%rsi), %rsi - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, 24(%rdi) - mov %r11, %rbp - lea 32(%rdi), %rdi - sbb %eax, %eax - sub $4, %rcx - jnz .Ltop - -.Lend: shr $62, %rbp - add %eax, %eax - sbb $0, %rbp - mov %rbp, %rax - pop %rbp +.Ltop: mov 16(%rdx,%rcx,8), %r10 + mov (%rdx,%rcx,8), %r8 + mov 8(%rdx,%rcx,8), %r9 + lea (%r11,%r8,4), %r12 + shr $62, %r8 +.Le00: lea (%r8,%r9,4), %r13 + shr $62, %r9 + mov 24(%rdx,%rcx,8), %r11 + lea (%r9,%r10,4), %r14 + shr $62, %r10 + lea (%r10,%r11,4), %r15 + shr $62, %r11 + add %eax, %eax + sbb (%rsi,%rcx,8), %r12 + sbb 8(%rsi,%rcx,8), %r13 + sbb 16(%rsi,%rcx,8), %r14 + sbb 24(%rsi,%rcx,8), %r15 + mov %r12, (%rdi,%rcx,8) + mov %r13, 8(%rdi,%rcx,8) + mov %r14, 16(%rdi,%rcx,8) + sbb %eax, %eax + mov %r15, 24(%rdi,%rcx,8) + add $4, %rcx + js .Ltop +.Lend: + + + add %r11d, %eax + movslq %eax, %rax + + pop %r15 + pop %r14 + pop %r13 + pop %r12 ret .size __gmpn_rsblsh2_n,.-__gmpn_rsblsh2_n diff --git a/ext/gmp/gen/x86_64-linux/mpn/rsblsh_n.s b/ext/gmp/gen/x86_64-linux/mpn/rsblsh_n.s index b64824b9f9..d439217a6c 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/rsblsh_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/rsblsh_n.s @@ -66,32 +66,7 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - + @@ -103,7 +78,7 @@ .text - .align 32, 0x90 + .align 16, 0x90 .globl __gmpn_rsblsh_n .type __gmpn_rsblsh_n,@function @@ -111,142 +86,143 @@ __gmpn_rsblsh_n: + push %r12 + push %rbp + push %rbx + + mov (%rdx), %rax + + mov $0, %ebp + sub %rcx, %rbp + + lea -16(%rsi,%rcx,8), %rsi + lea -16(%rdi,%rcx,8), %rdi + lea 16(%rdx,%rcx,8), %r12 + + mov %rcx, %r9 + + mov %r8, %rcx + mov $1, %r8d + shl %cl, %r8 + + mul %r8 + + and $3, %r9d + jz .Lb0 + cmp $2, %r9d + jc .Lb1 + jz .Lb2 + +.Lb3: mov %rax, %r11 + sub 16(%rsi,%rbp,8), %r11 + mov -8(%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov (%r12,%rbp,8), %rax + mov %rdx, %r9 + mul %r8 + or %rax, %r9 + add $3, %rbp + 
jnz .Llo3 + jmp .Lcj3 + +.Lb2: mov %rax, %rbx + mov -8(%r12,%rbp,8), %rax + mov %rdx, %r9 + mul %r8 + or %rax, %r9 + add $2, %rbp + jz .Lcj2 + mov %rdx, %r10 + mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + xor %ecx, %ecx + jmp .Llo2 + +.Lb1: mov %rax, %r9 + mov %rdx, %r10 + add $1, %rbp + jnz .Lgt1 + sub 8(%rsi,%rbp,8), %r9 + jmp .Lcj1 +.Lgt1: mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + sub 8(%rsi,%rbp,8), %r9 + sbb 16(%rsi,%rbp,8), %r10 + sbb 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + jmp .Llo1 + +.Lb0: mov %rax, %r10 + mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + sub 16(%rsi,%rbp,8), %r10 + sbb 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov 8(%r12,%rbp,8), %rax + add $4, %rbp + jz .Lend - mov (%rdx), %r10 - - mov %ecx, %eax - shr $3, %rcx - xor %r9d, %r9d - sub %r8, %r9 - and $7, %eax - - lea .Ltab(%rip), %r11 - - movslq (%r11,%rax,4), %rax - add %r11, %rax - jmp *%rax - - -.L0: lea 32(%rsi), %rsi - lea 32(%rdx), %rdx - lea 32(%rdi), %rdi - xor %r11d, %r11d - jmp .Le0 - -.L7: mov %r10, %r11 - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea 24(%rdi), %rdi - xor %r10d, %r10d - jmp .Le7 - -.L6: lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea 16(%rdi), %rdi - xor %r11d, %r11d - jmp .Le6 - -.L5: mov %r10, %r11 - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - xor %r10d, %r10d - jmp .Le5 - -.Lend: sbb 24(%rsi), %rax - mov %rax, -40(%rdi) - .byte 0xc4,194,179,0xf7,195 - sbb %rcx, %rax + .align 8, 0x90 +.Ltop: mov %rdx, %r9 + mul %r8 + or %rax, %r9 + mov %r10, -16(%rdi,%rbp,8) +.Llo3: mov %rdx, %r10 + mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + mov %r11, -8(%rdi,%rbp,8) +.Llo2: mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + add %ecx, %ecx + sbb (%rsi,%rbp,8), %rbx + sbb 8(%rsi,%rbp,8), %r9 + sbb 16(%rsi,%rbp,8), %r10 + sbb 
24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rbx, (%rdi,%rbp,8) +.Llo1: mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov %r9, 8(%rdi,%rbp,8) +.Llo0: mov 8(%r12,%rbp,8), %rax + add $4, %rbp + jnz .Ltop + +.Lend: mov %rdx, %r9 + mul %r8 + or %rax, %r9 + mov %r10, -16(%rdi,%rbp,8) +.Lcj3: mov %r11, -8(%rdi,%rbp,8) +.Lcj2: add %ecx, %ecx + sbb (%rsi,%rbp,8), %rbx + sbb 8(%rsi,%rbp,8), %r9 + mov %rbx, (%rdi,%rbp,8) +.Lcj1: mov %r9, 8(%rdi,%rbp,8) + mov %rdx, %rax + sbb $0, %rax + pop %rbx + pop %rbp + pop %r12 ret - - .align 32, 0x90 -.Ltop: jrcxz .Lend - mov -32(%rdx), %r10 - sbb 24(%rsi), %rax - lea 64(%rsi), %rsi - .byte 0xc4,66,179,0xf7,219 - mov %rax, -40(%rdi) -.Le0: dec %rcx - .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov -24(%rdx), %r11 - sbb -32(%rsi), %rax - .byte 0xc4,66,179,0xf7,210 - mov %rax, -32(%rdi) -.Le7: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - mov -16(%rdx), %r10 - sbb -24(%rsi), %rax - .byte 0xc4,66,179,0xf7,219 - mov %rax, -24(%rdi) -.Le6: .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov -8(%rdx), %r11 - sbb -16(%rsi), %rax - .byte 0xc4,66,179,0xf7,210 - mov %rax, -16(%rdi) -.Le5: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - mov (%rdx), %r10 - sbb -8(%rsi), %rax - .byte 0xc4,66,179,0xf7,219 - mov %rax, -8(%rdi) -.Le4: .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov 8(%rdx), %r11 - sbb (%rsi), %rax - .byte 0xc4,66,179,0xf7,210 - mov %rax, (%rdi) -.Le3: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - mov 16(%rdx), %r10 - sbb 8(%rsi), %rax - .byte 0xc4,66,179,0xf7,219 - mov %rax, 8(%rdi) -.Le2: .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov 24(%rdx), %r11 - sbb 16(%rsi), %rax - lea 64(%rdx), %rdx - .byte 0xc4,66,179,0xf7,210 - mov %rax, 16(%rdi) - lea 64(%rdi), %rdi -.Le1: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - jmp .Ltop - -.L4: xor %r11d, %r11d - jmp .Le4 - -.L3: mov %r10, %r11 - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi - 
xor %r10d, %r10d - jmp .Le3 - -.L2: lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi - xor %r11d, %r11d - jmp .Le2 - -.L1: mov %r10, %r11 - lea -24(%rsi), %rsi - lea 40(%rdx), %rdx - lea 40(%rdi), %rdi - xor %r10d, %r10d - jmp .Le1 .size __gmpn_rsblsh_n,.-__gmpn_rsblsh_n - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .L0-.Ltab - .long .L1-.Ltab - .long .L2-.Ltab - .long .L3-.Ltab - .long .L4-.Ltab - .long .L5-.Ltab - .long .L6-.Ltab - .long .L7-.Ltab - diff --git a/ext/gmp/gen/x86_64-linux/mpn/rsh1add_n.s b/ext/gmp/gen/x86_64-linux/mpn/rsh1add_n.s index c385f661fc..8554f6f047 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/rsh1add_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/rsh1add_n.s @@ -56,6 +56,8 @@ + + @@ -72,7 +74,6 @@ .text - .align 16, 0x90 .globl __gmpn_rsh1add_nc .type __gmpn_rsh1add_nc,@function @@ -82,12 +83,11 @@ __gmpn_rsh1add_nc: push %rbx - push %rbp + xor %eax, %eax neg %r8 - mov (%rsi), %rbp - adc (%rdx), %rbp - + mov (%rsi), %rbx + adc (%rdx), %rbx jmp .Lent .size __gmpn_rsh1add_nc,.-__gmpn_rsh1add_nc @@ -99,14 +99,13 @@ __gmpn_rsh1add_n: push %rbx - push %rbp - mov (%rsi), %rbp - add (%rdx), %rbp + xor %eax, %eax + mov (%rsi), %rbx + add (%rdx), %rbx .Lent: - sbb %ebx, %ebx - mov %ebp, %eax - and $1, %eax + rcr %rbx + adc %eax, %eax mov %ecx, %r11d and $3, %r11d @@ -116,21 +115,20 @@ __gmpn_rsh1add_n: .Ln1: cmp $2, %r11d jne .Ln2 - add %ebx, %ebx + add %rbx, %rbx mov 8(%rsi), %r10 adc 8(%rdx), %r10 lea 8(%rsi), %rsi lea 8(%rdx), %rdx lea 8(%rdi), %rdi - sbb %ebx, %ebx - - shrd $1, %r10, %rbp - mov %rbp, -8(%rdi) + rcr %r10 + rcr %rbx + mov %rbx, -8(%rdi) jmp .Lcj1 .Ln2: cmp $3, %r11d jne .Ln3 - add %ebx, %ebx + add %rbx, %rbx mov 8(%rsi), %r9 mov 16(%rsi), %r10 adc 8(%rdx), %r9 @@ -138,14 +136,14 @@ __gmpn_rsh1add_n: lea 16(%rsi), %rsi lea 16(%rdx), %rdx lea 16(%rdi), %rdi - sbb %ebx, %ebx - - shrd $1, %r9, %rbp - mov %rbp, -16(%rdi) + rcr %r10 + rcr %r9 + rcr %rbx + mov %rbx, -16(%rdi) jmp .Lcj2 -.Ln3: dec %rcx - 
add %ebx, %ebx +.Ln3: dec %rcx + add %rbx, %rbx mov 8(%rsi), %r8 mov 16(%rsi), %r9 adc 8(%rdx), %r8 @@ -155,21 +153,20 @@ __gmpn_rsh1add_n: lea 24(%rsi), %rsi lea 24(%rdx), %rdx lea 24(%rdi), %rdi - sbb %ebx, %ebx - - shrd $1, %r8, %rbp - mov %rbp, -24(%rdi) - shrd $1, %r9, %r8 + rcr %r10 + rcr %r9 + rcr %r8 + rcr %rbx + mov %rbx, -24(%rdi) mov %r8, -16(%rdi) -.Lcj2: shrd $1, %r10, %r9 - mov %r9, -8(%rdi) -.Lcj1: mov %r10, %rbp +.Lcj2: mov %r9, -8(%rdi) +.Lcj1: mov %r10, %rbx .Ldo: - shr $2, %rcx + shr $2, %rcx je .Lend .align 16, 0x90 -.Ltop: add %ebx, %ebx +.Ltop: add %rbx, %rbx mov 8(%rsi), %r8 mov 16(%rsi), %r9 @@ -183,25 +180,23 @@ __gmpn_rsh1add_n: lea 32(%rsi), %rsi lea 32(%rdx), %rdx - sbb %ebx, %ebx + rcr %r11 + rcr %r10 + rcr %r9 + rcr %r8 - shrd $1, %r8, %rbp - mov %rbp, (%rdi) - shrd $1, %r9, %r8 + rcr %rbx + mov %rbx, (%rdi) mov %r8, 8(%rdi) - shrd $1, %r10, %r9 mov %r9, 16(%rdi) - shrd $1, %r11, %r10 mov %r10, 24(%rdi) + mov %r11, %rbx - dec %rcx - mov %r11, %rbp lea 32(%rdi), %rdi + dec %rcx jne .Ltop -.Lend: shrd $1, %rbx, %rbp - mov %rbp, (%rdi) - pop %rbp +.Lend: mov %rbx, (%rdi) pop %rbx ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/rsh1sub_n.s b/ext/gmp/gen/x86_64-linux/mpn/rsh1sub_n.s index 0d7ab328a6..ff06ece4bc 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/rsh1sub_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/rsh1sub_n.s @@ -57,6 +57,8 @@ + + @@ -72,7 +74,6 @@ .text - .align 16, 0x90 .globl __gmpn_rsh1sub_nc .type __gmpn_rsh1sub_nc,@function @@ -82,12 +83,11 @@ __gmpn_rsh1sub_nc: push %rbx - push %rbp + xor %eax, %eax neg %r8 - mov (%rsi), %rbp - sbb (%rdx), %rbp - + mov (%rsi), %rbx + sbb (%rdx), %rbx jmp .Lent .size __gmpn_rsh1sub_nc,.-__gmpn_rsh1sub_nc @@ -99,14 +99,13 @@ __gmpn_rsh1sub_n: push %rbx - push %rbp - mov (%rsi), %rbp - sub (%rdx), %rbp + xor %eax, %eax + mov (%rsi), %rbx + sub (%rdx), %rbx .Lent: - sbb %ebx, %ebx - mov %ebp, %eax - and $1, %eax + rcr %rbx + adc %eax, %eax mov %ecx, %r11d and $3, %r11d @@ -116,21 +115,20 @@ 
__gmpn_rsh1sub_n: .Ln1: cmp $2, %r11d jne .Ln2 - add %ebx, %ebx + add %rbx, %rbx mov 8(%rsi), %r10 sbb 8(%rdx), %r10 lea 8(%rsi), %rsi lea 8(%rdx), %rdx lea 8(%rdi), %rdi - sbb %ebx, %ebx - - shrd $1, %r10, %rbp - mov %rbp, -8(%rdi) + rcr %r10 + rcr %rbx + mov %rbx, -8(%rdi) jmp .Lcj1 .Ln2: cmp $3, %r11d jne .Ln3 - add %ebx, %ebx + add %rbx, %rbx mov 8(%rsi), %r9 mov 16(%rsi), %r10 sbb 8(%rdx), %r9 @@ -138,14 +136,14 @@ __gmpn_rsh1sub_n: lea 16(%rsi), %rsi lea 16(%rdx), %rdx lea 16(%rdi), %rdi - sbb %ebx, %ebx - - shrd $1, %r9, %rbp - mov %rbp, -16(%rdi) + rcr %r10 + rcr %r9 + rcr %rbx + mov %rbx, -16(%rdi) jmp .Lcj2 -.Ln3: dec %rcx - add %ebx, %ebx +.Ln3: dec %rcx + add %rbx, %rbx mov 8(%rsi), %r8 mov 16(%rsi), %r9 sbb 8(%rdx), %r8 @@ -155,21 +153,20 @@ __gmpn_rsh1sub_n: lea 24(%rsi), %rsi lea 24(%rdx), %rdx lea 24(%rdi), %rdi - sbb %ebx, %ebx - - shrd $1, %r8, %rbp - mov %rbp, -24(%rdi) - shrd $1, %r9, %r8 + rcr %r10 + rcr %r9 + rcr %r8 + rcr %rbx + mov %rbx, -24(%rdi) mov %r8, -16(%rdi) -.Lcj2: shrd $1, %r10, %r9 - mov %r9, -8(%rdi) -.Lcj1: mov %r10, %rbp +.Lcj2: mov %r9, -8(%rdi) +.Lcj1: mov %r10, %rbx .Ldo: - shr $2, %rcx + shr $2, %rcx je .Lend .align 16, 0x90 -.Ltop: add %ebx, %ebx +.Ltop: add %rbx, %rbx mov 8(%rsi), %r8 mov 16(%rsi), %r9 @@ -183,25 +180,23 @@ __gmpn_rsh1sub_n: lea 32(%rsi), %rsi lea 32(%rdx), %rdx - sbb %ebx, %ebx + rcr %r11 + rcr %r10 + rcr %r9 + rcr %r8 - shrd $1, %r8, %rbp - mov %rbp, (%rdi) - shrd $1, %r9, %r8 + rcr %rbx + mov %rbx, (%rdi) mov %r8, 8(%rdi) - shrd $1, %r10, %r9 mov %r9, 16(%rdi) - shrd $1, %r11, %r10 mov %r10, 24(%rdi) + mov %r11, %rbx - dec %rcx - mov %r11, %rbp lea 32(%rdi), %rdi + dec %rcx jne .Ltop -.Lend: shrd $1, %rbx, %rbp - mov %rbp, (%rdi) - pop %rbp +.Lend: mov %rbx, (%rdi) pop %rbx ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/rshift.s b/ext/gmp/gen/x86_64-linux/mpn/rshift.s index 386eccd1ac..8ddd7b5557 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/rshift.s +++ b/ext/gmp/gen/x86_64-linux/mpn/rshift.s @@ -41,32 
+41,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -89,142 +63,129 @@ .text - .align 64, 0x90 + .align 32, 0x90 .globl __gmpn_rshift .type __gmpn_rshift,@function __gmpn_rshift: - movd %ecx, %xmm4 - mov $64, %eax - sub %ecx, %eax - movd %eax, %xmm5 - - neg %ecx + neg %ecx mov (%rsi), %rax - shl %cl, %rax - - cmp $3, %rdx - jle .Lbc + shl %cl, %rax + neg %ecx - test $8, %dil - jz .Lrp_aligned - - - movq (%rsi), %xmm0 - movq 8(%rsi), %xmm1 - psrlq %xmm4, %xmm0 - psllq %xmm5, %xmm1 - por %xmm1, %xmm0 - movq %xmm0, (%rdi) - lea 8(%rsi), %rsi - lea 8(%rdi), %rdi - dec %rdx - -.Lrp_aligned: lea 1(%rdx), %r8d - lea (%rsi,%rdx,8), %rsi - lea (%rdi,%rdx,8), %rdi - neg %rdx - and $6, %r8d - jz .Lbu0 - cmp $4, %r8d - jz .Lbu4 - jc .Lbu2 -.Lbu6: add $4, %rdx - jmp .Li56 -.Lbu0: add $6, %rdx - jmp .Li70 -.Lbu4: add $2, %rdx - jmp .Li34 -.Lbu2: add $8, %rdx - jge .Lend + lea -8(%rsi,%rdx,8), %rsi + lea -8(%rdi,%rdx,8), %rdi + neg %rdx + and $3, %r8d + je .Lrlx + + dec %r8d + jne .L1 + + mov 8(%rsi,%rdx,8), %r10 + shr %cl, %r10 + neg %ecx + mov 16(%rsi,%rdx,8), %r8 + shl %cl, %r8 + or %r8, %r10 + mov %r10, 8(%rdi,%rdx,8) + inc %rdx + jmp .Lrll + +.L1: dec %r8d + je .L1x + + mov 8(%rsi,%rdx,8), %r10 + shr %cl, %r10 + neg %ecx + mov 16(%rsi,%rdx,8), %r8 + shl %cl, %r8 + or %r8, %r10 + mov %r10, 8(%rdi,%rdx,8) + inc %rdx + neg %ecx +.L1x: + cmp $-1, %rdx + je .Last + mov 8(%rsi,%rdx,8), %r10 + shr %cl, %r10 + mov 16(%rsi,%rdx,8), %r11 + shr %cl, %r11 + neg %ecx + mov 16(%rsi,%rdx,8), %r8 + mov 24(%rsi,%rdx,8), %r9 + shl %cl, %r8 + or %r8, %r10 + shl %cl, %r9 + or %r9, %r11 + mov %r10, 8(%rdi,%rdx,8) + mov %r11, 16(%rdi,%rdx,8) + add $2, %rdx + +.Lrll: neg %ecx +.Lrlx: mov 8(%rsi,%rdx,8), %r10 + shr %cl, %r10 + mov 16(%rsi,%rdx,8), %r11 + shr %cl, %r11 + + add $4, %rdx + jb .Lend .align 16, 0x90 -.Ltop: movdqu -64(%rsi,%rdx,8), %xmm1 - movdqu -56(%rsi,%rdx,8), %xmm0 - psllq %xmm5, %xmm0 - psrlq %xmm4, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, -64(%rdi,%rdx,8) -.Li70: - 
movdqu -48(%rsi,%rdx,8), %xmm1 - movdqu -40(%rsi,%rdx,8), %xmm0 - psllq %xmm5, %xmm0 - psrlq %xmm4, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, -48(%rdi,%rdx,8) -.Li56: - movdqu -32(%rsi,%rdx,8), %xmm1 - movdqu -24(%rsi,%rdx,8), %xmm0 - psllq %xmm5, %xmm0 - psrlq %xmm4, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, -32(%rdi,%rdx,8) -.Li34: - movdqu -16(%rsi,%rdx,8), %xmm1 - movdqu -8(%rsi,%rdx,8), %xmm0 - psllq %xmm5, %xmm0 - psrlq %xmm4, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, -16(%rdi,%rdx,8) - add $8, %rdx - jl .Ltop - -.Lend: test $1, %dl - jnz .Le1 - - movdqu -16(%rsi), %xmm1 - movq -8(%rsi), %xmm0 - psrlq %xmm4, %xmm1 - psllq %xmm5, %xmm0 - por %xmm1, %xmm0 - movdqa %xmm0, -16(%rdi) +.Ltop: - ret - -.Le1: movq -8(%rsi), %xmm0 - psrlq %xmm4, %xmm0 - movq %xmm0, -8(%rdi) + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + mov -8(%rsi,%rdx,8), %r9 + shl %cl, %r8 + or %r8, %r10 + shl %cl, %r9 + or %r9, %r11 + mov %r10, -24(%rdi,%rdx,8) + mov %r11, -16(%rdi,%rdx,8) - ret + mov (%rsi,%rdx,8), %r8 + mov 8(%rsi,%rdx,8), %r9 + shl %cl, %r8 + shl %cl, %r9 - - .align 16, 0x90 -.Lbc: dec %edx - jnz 1f - movq (%rsi), %xmm0 - psrlq %xmm4, %xmm0 - movq %xmm0, (%rdi) - ret - -1: movq (%rsi), %xmm1 - movq 8(%rsi), %xmm0 - psrlq %xmm4, %xmm1 - psllq %xmm5, %xmm0 - por %xmm1, %xmm0 - movq %xmm0, (%rdi) - dec %edx - jnz 1f - movq 8(%rsi), %xmm0 - psrlq %xmm4, %xmm0 - movq %xmm0, 8(%rdi) + neg %ecx + mov -8(%rsi,%rdx,8), %r10 + mov 0(%rsi,%rdx,8), %r11 + shr %cl, %r10 + or %r10, %r8 + shr %cl, %r11 + or %r11, %r9 + mov %r8, -8(%rdi,%rdx,8) + mov %r9, 0(%rdi,%rdx,8) - ret - -1: movq 8(%rsi), %xmm1 - movq 16(%rsi), %xmm0 - psrlq %xmm4, %xmm1 - psllq %xmm5, %xmm0 - por %xmm1, %xmm0 - movq %xmm0, 8(%rdi) - movq 16(%rsi), %xmm0 - psrlq %xmm4, %xmm0 - movq %xmm0, 16(%rdi) + mov 8(%rsi,%rdx,8), %r10 + mov 16(%rsi,%rdx,8), %r11 + shr %cl, %r10 + shr %cl, %r11 + + add $4, %rdx + jae .Ltop +.Lend: + neg %ecx + mov -8(%rsi), %r8 + shl %cl, %r8 + or %r8, %r10 + mov (%rsi), %r9 + shl %cl, %r9 + or %r9, %r11 + 
mov %r10, -16(%rdi) + mov %r11, -8(%rdi) + + neg %ecx +.Last: mov (%rsi), %r10 + shr %cl, %r10 + mov %r10, (%rdi) ret .size __gmpn_rshift,.-__gmpn_rshift - diff --git a/ext/gmp/gen/x86_64-linux/mpn/sec_tabselect.s b/ext/gmp/gen/x86_64-linux/mpn/sec_tabselect.s index 6e67f45c31..7a50a70410 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/sec_tabselect.s +++ b/ext/gmp/gen/x86_64-linux/mpn/sec_tabselect.s @@ -62,21 +62,6 @@ - - - - - - - - - - - - - - - @@ -103,131 +88,103 @@ __gmpn_sec_tabselect: - - - - - - - movd %r8, %xmm8 - pshufd $0, %xmm8, %xmm8 - mov $1, %eax - movd %rax, %xmm9 - pshufd $0, %xmm9, %xmm9 + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 mov %rdx, %r9 - add $-8, %r9 + add $-4, %r9 js .Louter_end .Louter_top: - mov %rcx, %r10 - mov %rsi, %r11 - pxor %xmm1, %xmm1 - pxor %xmm4, %xmm4 - pxor %xmm5, %xmm5 - pxor %xmm6, %xmm6 - pxor %xmm7, %xmm7 + mov %rcx, %rbp + push %rsi + xor %r12d, %r12d + xor %r13d, %r13d + xor %r14d, %r14d + xor %r15d, %r15d + mov %r8, %rbx + .align 16, 0x90 -.Ltop: movdqa %xmm8, %xmm0 - pcmpeqd %xmm1, %xmm0 - paddd %xmm9, %xmm1 - movdqu 0(%rsi), %xmm2 - movdqu 16(%rsi), %xmm3 - pand %xmm0, %xmm2 - pand %xmm0, %xmm3 - por %xmm2, %xmm4 - por %xmm3, %xmm5 - movdqu 32(%rsi), %xmm2 - movdqu 48(%rsi), %xmm3 - pand %xmm0, %xmm2 - pand %xmm0, %xmm3 - por %xmm2, %xmm6 - por %xmm3, %xmm7 +.Ltop: sub $1, %rbx + sbb %rax, %rax + mov 0(%rsi), %r10 + mov 8(%rsi), %r11 + and %rax, %r10 + and %rax, %r11 + or %r10, %r12 + or %r11, %r13 + mov 16(%rsi), %r10 + mov 24(%rsi), %r11 + and %rax, %r10 + and %rax, %r11 + or %r10, %r14 + or %r11, %r15 lea (%rsi,%rdx,8), %rsi - add $-1, %r10 + add $-1, %rbp jne .Ltop - movdqu %xmm4, 0(%rdi) - movdqu %xmm5, 16(%rdi) - movdqu %xmm6, 32(%rdi) - movdqu %xmm7, 48(%rdi) - - lea 64(%r11), %rsi - lea 64(%rdi), %rdi - add $-8, %r9 + mov %r12, 0(%rdi) + mov %r13, 8(%rdi) + mov %r14, 16(%rdi) + mov %r15, 24(%rdi) + pop %rsi + lea 32(%rsi), %rsi + lea 32(%rdi), %rdi + add $-4, %r9 jns .Louter_top 
.Louter_end: - test $4, %dl - je .Lb0xx -.Lb1xx:mov %rcx, %r10 - mov %rsi, %r11 - pxor %xmm1, %xmm1 - pxor %xmm4, %xmm4 - pxor %xmm5, %xmm5 + test $2, %dl + jz .Lb0x +.Lb1x: mov %rcx, %rbp + push %rsi + xor %r12d, %r12d + xor %r13d, %r13d + mov %r8, %rbx .align 16, 0x90 -.Ltp4: movdqa %xmm8, %xmm0 - pcmpeqd %xmm1, %xmm0 - paddd %xmm9, %xmm1 - movdqu 0(%rsi), %xmm2 - movdqu 16(%rsi), %xmm3 - pand %xmm0, %xmm2 - pand %xmm0, %xmm3 - por %xmm2, %xmm4 - por %xmm3, %xmm5 +.Ltp2: sub $1, %rbx + sbb %rax, %rax + mov 0(%rsi), %r10 + mov 8(%rsi), %r11 + and %rax, %r10 + and %rax, %r11 + or %r10, %r12 + or %r11, %r13 lea (%rsi,%rdx,8), %rsi - add $-1, %r10 - jne .Ltp4 - movdqu %xmm4, 0(%rdi) - movdqu %xmm5, 16(%rdi) - lea 32(%r11), %rsi - lea 32(%rdi), %rdi - -.Lb0xx:test $2, %dl - je .Lb00x -.Lb01x:mov %rcx, %r10 - mov %rsi, %r11 - pxor %xmm1, %xmm1 - pxor %xmm4, %xmm4 - .align 16, 0x90 -.Ltp2: movdqa %xmm8, %xmm0 - pcmpeqd %xmm1, %xmm0 - paddd %xmm9, %xmm1 - movdqu 0(%rsi), %xmm2 - pand %xmm0, %xmm2 - por %xmm2, %xmm4 - lea (%rsi,%rdx,8), %rsi - add $-1, %r10 + add $-1, %rbp jne .Ltp2 - movdqu %xmm4, 0(%rdi) - lea 16(%r11), %rsi + mov %r12, 0(%rdi) + mov %r13, 8(%rdi) + pop %rsi + lea 16(%rsi), %rsi lea 16(%rdi), %rdi -.Lb00x:test $1, %dl - je .Lb000 -.Lb001:mov %rcx, %r10 - mov %rsi, %r11 - pxor %xmm1, %xmm1 - pxor %xmm4, %xmm4 +.Lb0x: test $1, %dl + jz .Lb00 +.Lb01: mov %rcx, %rbp + xor %r12d, %r12d + mov %r8, %rbx .align 16, 0x90 -.Ltp1: movdqa %xmm8, %xmm0 - pcmpeqd %xmm1, %xmm0 - paddd %xmm9, %xmm1 - movq 0(%rsi), %xmm2 - pand %xmm0, %xmm2 - por %xmm2, %xmm4 +.Ltp1: sub $1, %rbx + sbb %rax, %rax + mov 0(%rsi), %r10 + and %rax, %r10 + or %r10, %r12 lea (%rsi,%rdx,8), %rsi - add $-1, %r10 + add $-1, %rbp jne .Ltp1 - movq %xmm4, 0(%rdi) - -.Lb000: - - - - - + mov %r12, 0(%rdi) + +.Lb00: pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx ret .size __gmpn_sec_tabselect,.-__gmpn_sec_tabselect - diff --git a/ext/gmp/gen/x86_64-linux/mpn/sqr_basecase.s 
b/ext/gmp/gen/x86_64-linux/mpn/sqr_basecase.s index 26efdaa53a..eb24851327 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/sqr_basecase.s +++ b/ext/gmp/gen/x86_64-linux/mpn/sqr_basecase.s @@ -89,6 +89,11 @@ + + + + + @@ -103,746 +108,711 @@ __gmpn_sqr_basecase: + mov %edx, %ecx + mov %edx, %r11d + + add $-40, %rsp + + and $3, %ecx + cmp $4, %edx + lea 4(%rcx), %r8 - cmp $2, %rdx - jae .Lgt1 + mov %rbx, 32(%rsp) + mov %rbp, 24(%rsp) + mov %r12, 16(%rsp) + mov %r13, 8(%rsp) + mov %r14, (%rsp) + + cmovg %r8, %rcx + + lea .Ltab(%rip), %rax + movslq (%rax,%rcx,4), %r10 + add %r10, %rax + jmp *%rax + + .section .data.rel.ro.local,"a",@progbits + .align 8, 0x90 +.Ltab: .long .L4-.Ltab + .long .L1-.Ltab + .long .L2-.Ltab + .long .L3-.Ltab + .long .L0m4-.Ltab + .long .L1m4-.Ltab + .long .L2m4-.Ltab + .long .L3m4-.Ltab + .text - mov (%rsi), %rdx - .byte 0xc4,226,251,0xf6,210 +.L1: mov (%rsi), %rax + mul %rax + add $40, %rsp mov %rax, (%rdi) mov %rdx, 8(%rdi) ret -.Lgt1: jne .Lgt2 - - mov (%rsi), %rdx - mov 8(%rsi), %rcx - .byte 0xc4,98,179,0xf6,209 - .byte 0xc4,98,251,0xf6,194 - mov %rcx, %rdx - .byte 0xc4,226,163,0xf6,210 - add %r9, %r9 - adc %r10, %r10 - adc $0, %rdx - add %r9, %r8 - adc %r11, %r10 - adc $0, %rdx +.L2: mov (%rsi), %rax + mov %rax, %r8 + mul %rax + mov 8(%rsi), %r11 mov %rax, (%rdi) - mov %r8, 8(%rdi) + mov %r11, %rax + mov %rdx, %r9 + mul %rax + add $40, %rsp + mov %rax, %r10 + mov %r11, %rax + mov %rdx, %r11 + mul %r8 + xor %r8, %r8 + add %rax, %r9 + adc %rdx, %r10 + adc %r8, %r11 + add %rax, %r9 + mov %r9, 8(%rdi) + adc %rdx, %r10 mov %r10, 16(%rdi) + adc %r8, %r11 + mov %r11, 24(%rdi) + + ret + +.L3: mov (%rsi), %rax + mov %rax, %r10 + mul %rax + mov 8(%rsi), %r11 + mov %rax, (%rdi) + mov %r11, %rax + mov %rdx, 8(%rdi) + mul %rax + mov 16(%rsi), %rcx + mov %rax, 16(%rdi) + mov %rcx, %rax mov %rdx, 24(%rdi) + mul %rax + mov %rax, 32(%rdi) + mov %rdx, 40(%rdi) + + mov %r11, %rax + mul %r10 + mov %rax, %r8 + mov %rcx, %rax + mov %rdx, %r9 + mul %r10 + xor %r10, 
%r10 + add %rax, %r9 + mov %r11, %rax + mov %r10, %r11 + adc %rdx, %r10 + + mul %rcx + add $40, %rsp + add %rax, %r10 + adc %r11, %rdx + add %r8, %r8 + adc %r9, %r9 + adc %r10, %r10 + adc %rdx, %rdx + adc %r11, %r11 + add %r8, 8(%rdi) + adc %r9, 16(%rdi) + adc %r10, 24(%rdi) + adc %rdx, 32(%rdi) + adc %r11, 40(%rdi) ret -.Lgt2: cmp $4, %rdx - jae .Lgt3 - - push %rbx - mov (%rsi), %rdx - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xc4,98,187,0xf6,78,16 - add %r11, %r8 - mov 8(%rsi), %rdx - .byte 0xc4,98,251,0xf6,94,16 - adc %rax, %r9 - adc $0, %r11 - test %ebx, %ebx - mov (%rsi), %rdx - .byte 0xc4,226,227,0xf6,202 - mov %rbx, (%rdi) - mov 8(%rsi), %rdx - .byte 0xc4,226,251,0xf6,218 - mov 16(%rsi), %rdx - .byte 0xc4,226,203,0xf6,210 - .byte 0x66,77,0x0f,0x38,0xf6,210 - .byte 0x66,77,0x0f,0x38,0xf6,192 - .byte 0x66,77,0x0f,0x38,0xf6,201 - .byte 0x66,77,0x0f,0x38,0xf6,219 - .byte 0xf3,73,0x0f,0x38,0xf6,202 - .byte 0xf3,73,0x0f,0x38,0xf6,192 - .byte 0xf3,73,0x0f,0x38,0xf6,217 - .byte 0xf3,73,0x0f,0x38,0xf6,243 - mov $0, %r8d - .byte 0xf3,73,0x0f,0x38,0xf6,208 - .byte 0x66,73,0x0f,0x38,0xf6,208 - mov %rcx, 8(%rdi) +.L4: mov (%rsi), %rax + mov %rax, %r11 + mul %rax + mov 8(%rsi), %rbx + mov %rax, (%rdi) + mov %rbx, %rax + mov %rdx, 8(%rdi) + mul %rax mov %rax, 16(%rdi) - mov %rbx, 24(%rdi) - mov %rsi, 32(%rdi) + mov %rdx, 24(%rdi) + mov 16(%rsi), %rax + mul %rax + mov %rax, 32(%rdi) mov %rdx, 40(%rdi) + mov 24(%rsi), %rax + mul %rax + mov %rax, 48(%rdi) + mov %rbx, %rax + mov %rdx, 56(%rdi) + + mul %r11 + add $32, %rsp + mov %rax, %r8 + mov %rdx, %r9 + mov 16(%rsi), %rax + mul %r11 + xor %r10, %r10 + add %rax, %r9 + adc %rdx, %r10 + mov 24(%rsi), %rax + mul %r11 + xor %r11, %r11 + add %rax, %r10 + adc %rdx, %r11 + mov 16(%rsi), %rax + mul %rbx + xor %rcx, %rcx + add %rax, %r10 + adc %rdx, %r11 + adc $0, %rcx + mov 24(%rsi), %rax + mul %rbx pop %rbx + add %rax, %r11 + adc %rdx, %rcx + mov 16(%rsi), %rdx + mov 24(%rsi), %rax + mul %rdx + add %rax, %rcx + adc $0, %rdx + + add %r8, 
%r8 + adc %r9, %r9 + adc %r10, %r10 + adc %r11, %r11 + adc %rcx, %rcx + mov $0, %eax + adc %rdx, %rdx + + adc %rax, %rax + add %r8, 8(%rdi) + adc %r9, 16(%rdi) + adc %r10, 24(%rdi) + adc %r11, 32(%rdi) + adc %rcx, 40(%rdi) + adc %rdx, 48(%rdi) + adc %rax, 56(%rdi) ret -.Lgt3: push %rbx - - lea -3(%rdx), %ebx - lea 5(%rdx), %ecx - mov %edx, %eax - and $-8, %ebx - shr $3, %ecx - neg %rbx - and $7, %eax - mov (%rsi), %rdx +.L0m4: + lea -16(%rdi,%r11,8), %r12 + mov (%rsi), %r13 + mov 8(%rsi), %rax + lea (%rsi,%r11,8), %rsi - lea .Lmtab(%rip), %r10 - movslq (%r10,%rax,4), %r8 - lea (%r8, %r10), %r10 - jmp *%r10 + lea -4(%r11), %r8 + xor %r9d, %r9d + sub %r11, %r9 -.Lmf0: .byte 0xc4,98,187,0xf6,202 - add %rdx, %rdx - .byte 0xc4,98,171,0xf6,94,8 - lea 64(%rsi), %rsi - add %r9, %r10 - jmp .Lmb0 - -.Lmf3: .byte 0xc4,98,171,0xf6,218 - add %rdx, %rdx - mov %r10, (%rdi) - .byte 0xc4,98,187,0xf6,78,8 - lea 24(%rsi), %rsi - lea 24(%rdi), %rdi - add %r11, %r8 - jmp .Lmb3 - -.Lmf4: .byte 0xc4,98,187,0xf6,202 - add %rdx, %rdx - .byte 0xc4,98,171,0xf6,94,8 - mov %r8, (%rdi) - lea 32(%rsi), %rsi - lea 32(%rdi), %rdi - add %r9, %r10 - jmp .Lmb4 - -.Lmf5: .byte 0xc4,98,171,0xf6,218 - add %rdx, %rdx - .byte 0xc4,98,187,0xf6,78,8 - mov %r10, (%rdi) - lea 40(%rsi), %rsi - lea 40(%rdi), %rdi - add %r11, %r8 - jmp .Lmb5 - -.Lmf6: .byte 0xc4,98,187,0xf6,202 - add %rdx, %rdx - .byte 0xc4,98,171,0xf6,94,8 - mov %r8, (%rdi) - lea 48(%rsi), %rsi - lea 48(%rdi), %rdi - add %r9, %r10 - jmp .Lmb6 - -.Lmf7: .byte 0xc4,98,171,0xf6,218 - add %rdx, %rdx - .byte 0xc4,98,187,0xf6,78,8 - mov %r10, (%rdi) - lea 56(%rsi), %rsi - lea 56(%rdi), %rdi - add %r11, %r8 - jmp .Lmb7 - -.Lmf1: .byte 0xc4,98,171,0xf6,218 - add %rdx, %rdx - .byte 0xc4,98,187,0xf6,78,8 - mov %r10, (%rdi) - lea 8(%rsi), %rsi - lea 8(%rdi), %rdi - add %r11, %r8 - jmp .Lmb1 - -.Lmf2: .byte 0xc4,98,187,0xf6,202 - add %rdx, %rdx - .byte 0xc4,98,171,0xf6,94,8 - mov %r8, (%rdi) - lea 16(%rsi), %rsi - lea 16(%rdi), %rdi - dec %ecx - add %r9, 
%r10 - .byte 0xc4,98,187,0xf6,14 - - .align 16, 0x90 -.Ltop: mov %r10, -8(%rdi) - adc %r11, %r8 -.Lmb1: .byte 0xc4,98,171,0xf6,94,8 - adc %r9, %r10 - lea 64(%rsi), %rsi -.Lmb0: mov %r8, (%rdi) - mov %r10, 8(%rdi) - .byte 0xc4,98,187,0xf6,78,208 - lea 64(%rdi), %rdi - adc %r11, %r8 -.Lmb7: .byte 0xc4,98,171,0xf6,94,216 - mov %r8, -48(%rdi) - adc %r9, %r10 -.Lmb6: mov %r10, -40(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - adc %r11, %r8 -.Lmb5: .byte 0xc4,98,171,0xf6,94,232 - mov %r8, -32(%rdi) - adc %r9, %r10 -.Lmb4: .byte 0xc4,98,187,0xf6,78,240 - mov %r10, -24(%rdi) - adc %r11, %r8 -.Lmb3: .byte 0xc4,98,171,0xf6,94,248 - adc %r9, %r10 - mov %r8, -16(%rdi) - dec %ecx - .byte 0xc4,98,187,0xf6,14 - jnz .Ltop - -.Lend: mov %r10, -8(%rdi) - adc %r11, %r8 - - - - - lea .Latab(%rip), %r10 - movslq (%r10,%rax,4), %r11 - lea (%r11, %r10), %r11 - - mov $63, %eax - jmp *%r11 - -.Led0: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf7: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea -64(%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea -56(%rdi,%rbx,8), %rdi - mov (%rsi), %r9 - mov 8(%rsi), %rdx - .byte 0xc4,66,251,0xf7,193 - .byte 0xc4,66,250,0xf7,201 - and %rdx, %r9 - .byte 0xc4,98,171,0xf6,218 - lea (%r8,%rdx,2), %rdx - jmp .Lb7 - - .align 16, 0x90 -.Ltp0: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led0 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx -.Lb0: mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 
0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp0 - -.Led1: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf0: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea -64(%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea -56(%rdi,%rbx,8), %rdi - mov -8(%rsi), %r11 - mov (%rsi), %rdx - .byte 0xc4,66,251,0xf7,211 - .byte 0xc4,66,250,0xf7,219 - and %rdx, %r11 - .byte 0xc4,98,187,0xf6,202 - lea (%r10,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - jmp .Lb0 + mul %r13 + xor %ebp, %ebp + mov %rax, %rbx + mov 16(%rsi,%r9,8), %rax + mov %rdx, %r10 + jmp .LL3 .align 16, 0x90 -.Ltp1: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led1 -.Lb1: .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 
0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp1 - -.Led2: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf1: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea (%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea 8(%rbx), %rbx - lea -56(%rdi,%rbx,8), %rdi - mov -16(%rsi), %r9 - mov -8(%rsi), %rdx - .byte 0xc4,66,251,0xf7,193 - .byte 0xc4,66,250,0xf7,201 - and %rdx, %r9 - .byte 0xc4,98,171,0xf6,218 - lea (%r8,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jmp .Lb1 +.Lmul_1_m3_top: + add %rax, %rbp + mov %r10, (%r12,%r9,8) + mov (%rsi,%r9,8), %rax + adc %rdx, %rcx + xor %ebx, %ebx + mul %r13 + xor %r10d, %r10d + mov %rbp, 8(%r12,%r9,8) + add %rax, %rcx + adc %rdx, %rbx + mov 8(%rsi,%r9,8), %rax + mov %rcx, 16(%r12,%r9,8) + xor %ebp, %ebp + mul %r13 + add %rax, %rbx + mov 16(%rsi,%r9,8), %rax + adc %rdx, %r10 +.LL3: xor %ecx, %ecx + mul %r13 + add %rax, %r10 + mov 24(%rsi,%r9,8), %rax + adc %rdx, %rbp + mov %rbx, 24(%r12,%r9,8) + mul %r13 + add $4, %r9 + js .Lmul_1_m3_top + + add %rax, %rbp + mov %r10, (%r12) + adc %rdx, %rcx + mov %rbp, 8(%r12) + mov %rcx, 16(%r12) + + lea 16(%r12), %r12 + lea -8(%rsi), %rsi + jmp .Ldowhile + + +.L1m4: + lea 8(%rdi,%r11,8), %r12 + mov (%rsi), %r13 + mov 8(%rsi), %rax + lea 8(%rsi,%r11,8), %rsi + + lea -3(%r11), %r8 + + lea -3(%r11), %r9 + neg %r9 + + mov %rax, %r14 + mul %r13 + mov %rdx, %rcx + xor %ebp, %ebp + mov %rax, 8(%rdi) + jmp .Lm0 .align 16, 0x90 -.Ltp2: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led2 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - 
.byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) -.Lb2: .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp2 - -.Led3: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf2: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea (%rsi,%rbx,8), %rsi - or %ebx, %ecx - jz .Lcor3 - lea -56(%rdi,%rbx,8), %rdi - mov -24(%rsi), %r11 - mov -16(%rsi), %rdx - .byte 0xc4,66,251,0xf7,211 - .byte 0xc4,66,250,0xf7,219 - and %rdx, %r11 - .byte 0xc4,98,187,0xf6,202 - lea (%r10,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,195 - jmp .Lb2 +.Lmul_2_m0_top: + mul %r14 + add %rax, %rbx + adc %rdx, %rcx + mov -24(%rsi,%r9,8), %rax + mov $0, %ebp + mul %r13 + add %rax, %rbx + mov -24(%rsi,%r9,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r14 + add %rax, %rcx + mov %rbx, -24(%r12,%r9,8) + adc %rdx, %rbp +.Lm0: mov -16(%rsi,%r9,8), %rax + mul %r13 + mov $0, %r10d + add %rax, %rcx + adc %rdx, %rbp + mov -16(%rsi,%r9,8), %rax + adc $0, %r10d + mov $0, %ebx + mov %rcx, -16(%r12,%r9,8) + mul %r14 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + mov $0, %ecx + mul %r13 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + adc $0, %ebx + mul %r14 + add %rax, %r10 + mov %rbp, -8(%r12,%r9,8) + adc %rdx, %rbx +.Lm2x: mov (%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + adc %rdx, %rbx + adc $0, %ecx + add $4, %r9 + mov -32(%rsi,%r9,8), %rax + mov %r10, -32(%r12,%r9,8) + js .Lmul_2_m0_top + + mul %r14 + add %rax, %rbx + 
adc %rdx, %rcx + mov %rbx, -8(%r12) + mov %rcx, (%r12) + + lea -16(%rsi), %rsi + lea 0(%r12), %r12 + jmp .Ldowhile_end + + +.L2m4: + lea -16(%rdi,%r11,8), %r12 + mov (%rsi), %r13 + mov 8(%rsi), %rax + lea (%rsi,%r11,8), %rsi + + lea -4(%r11), %r8 + + lea -2(%r11), %r9 + neg %r9 + + mul %r13 + mov %rax, %rbp + mov (%rsi,%r9,8), %rax + mov %rdx, %rcx + jmp .LL1 .align 16, 0x90 -.Ltp3: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led3 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) -.Lb3: .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp3 - -.Led4: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf3: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea (%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea -56(%rdi,%rbx,8), %rdi - mov -32(%rsi), %r9 - mov -24(%rsi), %rdx - .byte 0xc4,66,251,0xf7,193 - .byte 0xc4,66,250,0xf7,201 - and %rdx, %r9 - .byte 0xc4,98,171,0xf6,218 - lea (%r8,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,209 - jmp .Lb3 +.Lmul_1_m1_top: + add %rax, %rbp + mov %r10, (%r12,%r9,8) + mov (%rsi,%r9,8), %rax + adc %rdx, %rcx +.LL1: xor %ebx, 
%ebx + mul %r13 + xor %r10d, %r10d + mov %rbp, 8(%r12,%r9,8) + add %rax, %rcx + adc %rdx, %rbx + mov 8(%rsi,%r9,8), %rax + mov %rcx, 16(%r12,%r9,8) + xor %ebp, %ebp + mul %r13 + add %rax, %rbx + mov 16(%rsi,%r9,8), %rax + adc %rdx, %r10 + xor %ecx, %ecx + mul %r13 + add %rax, %r10 + mov 24(%rsi,%r9,8), %rax + adc %rdx, %rbp + mov %rbx, 24(%r12,%r9,8) + mul %r13 + add $4, %r9 + js .Lmul_1_m1_top + + add %rax, %rbp + mov %r10, (%r12) + adc %rdx, %rcx + mov %rbp, 8(%r12) + mov %rcx, 16(%r12) + + lea 16(%r12), %r12 + lea -8(%rsi), %rsi + jmp .Ldowhile_mid + + +.L3m4: + lea 8(%rdi,%r11,8), %r12 + mov (%rsi), %r13 + mov 8(%rsi), %rax + lea 8(%rsi,%r11,8), %rsi + + lea -5(%r11), %r8 + + lea -1(%r11), %r9 + neg %r9 + + mov %rax, %r14 + mul %r13 + mov %rdx, %r10 + xor %ebx, %ebx + xor %ecx, %ecx + mov %rax, 8(%rdi) + jmp .Lm2 .align 16, 0x90 -.Ltp4: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led4 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) -.Lb4: .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp4 - -.Led5: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 
0xf3,76,0x0f,0x38,0xf6,201 -.Lf4: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea (%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea -56(%rdi,%rbx,8), %rdi - mov -40(%rsi), %r11 - mov -32(%rsi), %rdx - .byte 0xc4,66,251,0xf7,211 - .byte 0xc4,66,250,0xf7,219 - and %rdx, %r11 - .byte 0xc4,98,187,0xf6,202 - lea (%r10,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,195 - jmp .Lb4 +.Lmul_2_m2_top: + mul %r14 + add %rax, %rbx + adc %rdx, %rcx + mov -24(%rsi,%r9,8), %rax + mov $0, %ebp + mul %r13 + add %rax, %rbx + mov -24(%rsi,%r9,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r14 + add %rax, %rcx + mov %rbx, -24(%r12,%r9,8) + adc %rdx, %rbp + mov -16(%rsi,%r9,8), %rax + mul %r13 + mov $0, %r10d + add %rax, %rcx + adc %rdx, %rbp + mov -16(%rsi,%r9,8), %rax + adc $0, %r10d + mov $0, %ebx + mov %rcx, -16(%r12,%r9,8) + mul %r14 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + mov $0, %ecx + mul %r13 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + adc $0, %ebx + mul %r14 + add %rax, %r10 + mov %rbp, -8(%r12,%r9,8) + adc %rdx, %rbx +.Lm2: mov (%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + adc %rdx, %rbx + adc $0, %ecx + add $4, %r9 + mov -32(%rsi,%r9,8), %rax + mov %r10, -32(%r12,%r9,8) + js .Lmul_2_m2_top + + mul %r14 + add %rax, %rbx + adc %rdx, %rcx + mov %rbx, -8(%r12) + mov %rcx, (%r12) + + lea -16(%rsi), %rsi + jmp .Ldowhile_mid + +.Ldowhile: + + lea 4(%r8), %r9 + neg %r9 + + mov 16(%rsi,%r9,8), %r13 + mov 24(%rsi,%r9,8), %r14 + mov 24(%rsi,%r9,8), %rax + mul %r13 + xor %r10d, %r10d + add %rax, 24(%r12,%r9,8) + adc %rdx, %r10 + xor %ebx, %ebx + xor %ecx, %ecx + jmp .Lam2 .align 16, 0x90 -.Ltp5: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led5 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov 
%r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) -.Lb5: .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp5 - -.Led6: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf5: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea (%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea -56(%rdi,%rbx,8), %rdi - mov -48(%rsi), %r9 - mov -40(%rsi), %rdx - .byte 0xc4,66,251,0xf7,193 - .byte 0xc4,66,250,0xf7,201 - and %rdx, %r9 - .byte 0xc4,98,171,0xf6,218 - lea (%r8,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,209 - jmp .Lb5 +.Laddmul_2_m2_top: + add %r10, (%r12,%r9,8) + adc %rax, %rbx + mov 8(%rsi,%r9,8), %rax + adc %rdx, %rcx + mov $0, %ebp + mul %r13 + add %rax, %rbx + mov 8(%rsi,%r9,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r14 + add %rbx, 8(%r12,%r9,8) + adc %rax, %rcx + adc %rdx, %rbp + mov 16(%rsi,%r9,8), %rax + mov $0, %r10d + mul %r13 + add %rax, %rcx + mov 16(%rsi,%r9,8), %rax + adc %rdx, %rbp + adc $0, %r10d + mul %r14 + add %rcx, 16(%r12,%r9,8) + adc %rax, %rbp + mov 24(%rsi,%r9,8), %rax + adc %rdx, %r10 + mul %r13 + mov $0, %ebx + add %rax, %rbp + adc %rdx, %r10 + mov $0, %ecx + mov 24(%rsi,%r9,8), %rax + adc $0, %ebx + mul %r14 + add %rbp, 24(%r12,%r9,8) + adc %rax, %r10 + adc %rdx, %rbx +.Lam2: mov 32(%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + mov 32(%rsi,%r9,8), %rax + adc %rdx, %rbx + adc $0, %ecx + mul %r14 + add $4, %r9 + js 
.Laddmul_2_m2_top + + add %r10, (%r12) + adc %rax, %rbx + adc %rdx, %rcx + mov %rbx, 8(%r12) + mov %rcx, 16(%r12) + + lea 16(%r12), %r12 + + add $-2, %r8d + +.Ldowhile_mid: + + lea 2(%r8), %r9 + neg %r9 + + mov (%rsi,%r9,8), %r13 + mov 8(%rsi,%r9,8), %r14 + mov 8(%rsi,%r9,8), %rax + mul %r13 + xor %ecx, %ecx + add %rax, 8(%r12,%r9,8) + adc %rdx, %rcx + xor %ebp, %ebp + jmp .L20 .align 16, 0x90 -.Ltp6: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led6 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi -.Lb6: .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp6 - -.Led7: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf6: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea (%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea -56(%rdi,%rbx,8), %rdi - mov -56(%rsi), %r11 - mov -48(%rsi), %rdx - .byte 0xc4,66,251,0xf7,211 - .byte 0xc4,66,250,0xf7,219 - and %rdx, %r11 - .byte 0xc4,98,187,0xf6,202 - lea (%r10,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xc4,98,171,0xf6,94,216 - jmp .Lb6 +.Laddmul_2_m0_top: + add %r10, (%r12,%r9,8) + adc %rax, %rbx + 
mov 8(%rsi,%r9,8), %rax + adc %rdx, %rcx + mov $0, %ebp + mul %r13 + add %rax, %rbx + mov 8(%rsi,%r9,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r14 + add %rbx, 8(%r12,%r9,8) + adc %rax, %rcx + adc %rdx, %rbp +.L20: mov 16(%rsi,%r9,8), %rax + mov $0, %r10d + mul %r13 + add %rax, %rcx + mov 16(%rsi,%r9,8), %rax + adc %rdx, %rbp + adc $0, %r10d + mul %r14 + add %rcx, 16(%r12,%r9,8) + adc %rax, %rbp + mov 24(%rsi,%r9,8), %rax + adc %rdx, %r10 + mul %r13 + mov $0, %ebx + add %rax, %rbp + adc %rdx, %r10 + mov $0, %ecx + mov 24(%rsi,%r9,8), %rax + adc $0, %ebx + mul %r14 + add %rbp, 24(%r12,%r9,8) + adc %rax, %r10 + adc %rdx, %rbx + mov 32(%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + mov 32(%rsi,%r9,8), %rax + adc %rdx, %rbx + adc $0, %ecx + mul %r14 + add $4, %r9 + js .Laddmul_2_m0_top + + add %r10, (%r12) + adc %rax, %rbx + adc %rdx, %rcx + mov %rbx, 8(%r12) + mov %rcx, 16(%r12) + + lea 16(%r12), %r12 +.Ldowhile_end: + + add $-2, %r8d + jne .Ldowhile + + + mov -16(%rsi), %r13 + mov -8(%rsi), %r14 + mov -8(%rsi), %rax + mul %r13 + xor %r10d, %r10d + add %rax, -8(%r12) + adc %rdx, %r10 + xor %ebx, %ebx + xor %ecx, %ecx + mov (%rsi), %rax + mul %r13 + add %rax, %r10 + mov (%rsi), %rax + adc %rdx, %rbx + mul %r14 + add %r10, (%r12) + adc %rax, %rbx + adc %rdx, %rcx + mov %rbx, 8(%r12) + mov %rcx, 16(%r12) + + + lea -4(%r11,%r11), %r9 + + mov 8(%rdi), %r11 + lea -8(%rsi), %rsi + lea (%rdi,%r9,8), %rdi + neg %r9 + mov (%rsi,%r9,4), %rax + mul %rax + test $2, %r9b + jnz .Lodd + +.Levn: add %r11, %r11 + sbb %ebx, %ebx + add %rdx, %r11 + mov %rax, (%rdi,%r9,8) + jmp .Ld0 + +.Lodd: add %r11, %r11 + sbb %ebp, %ebp + add %rdx, %r11 + mov %rax, (%rdi,%r9,8) + lea -2(%r9), %r9 + jmp .Ld1 .align 16, 0x90 -.Ltp7: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led7 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) -.Lb7: .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 
0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp7 - -.Lcor3:lea -64(%rdi), %rdi - mov -24(%rsi), %r11 - mov -16(%rsi), %rdx - .byte 0xc4,66,251,0xf7,211 - .byte 0xc4,66,250,0xf7,219 - and %rdx, %r11 - .byte 0xc4,98,187,0xf6,202 - lea (%r10,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,71,56 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 56(%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,227,0xf6,14 - .byte 0xf3,76,0x0f,0x38,0xf6,87,64 - .byte 0x66,73,0x0f,0x38,0xf6,219 - mov %r10, 64(%rdi) - .byte 0xf3,72,0x0f,0x38,0xf6,95,72 - .byte 0xf3,76,0x0f,0x38,0xf6,201 - adc %rcx, %r9 - mov %r9, 80(%rdi) - - mov -16(%rsi), %r9 - mov -8(%rsi), %rdx - .byte 0xc4,66,251,0xf7,193 - .byte 0xc4,66,250,0xf7,201 - and %rdx, %r9 - .byte 0xc4,98,171,0xf6,218 - lea (%r8,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,226,187,0xf6,6 - .byte 0xf3,76,0x0f,0x38,0xf6,211 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 72(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,80 - .byte 0xf3,72,0x0f,0x38,0xf6,193 - mov %r8, 80(%rdi) - adc %rcx, %rax - - mov -8(%rsi), %r11 - mov (%rsi), %rdx - sar $63, %r11 - and %rdx, %r11 - .byte 0xc4,98,187,0xf6,202 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 
0xf3,76,0x0f,0x38,0xf6,192 - mov %r8, 88(%rdi) - .byte 0x66,76,0x0f,0x38,0xf6,201 - .byte 0xf3,76,0x0f,0x38,0xf6,201 - mov %r9, 96(%rdi) +.Ltop: mov (%rsi,%r9,4), %rax + mul %rax + add %ebp, %ebp + adc %rax, %r10 + adc %rdx, %r11 + mov %r10, (%rdi,%r9,8) +.Ld0: mov %r11, 8(%rdi,%r9,8) + mov 16(%rdi,%r9,8), %r10 + adc %r10, %r10 + mov 24(%rdi,%r9,8), %r11 + adc %r11, %r11 + nop + sbb %ebp, %ebp + mov 8(%rsi,%r9,4), %rax + mul %rax + add %ebx, %ebx + adc %rax, %r10 + adc %rdx, %r11 + mov %r10, 16(%rdi,%r9,8) +.Ld1: mov %r11, 24(%rdi,%r9,8) + mov 32(%rdi,%r9,8), %r10 + adc %r10, %r10 + mov 40(%rdi,%r9,8), %r11 + adc %r11, %r11 + sbb %ebx, %ebx + add $4, %r9 + js .Ltop + + mov (%rsi), %rax + mul %rax + add %ebp, %ebp + adc %rax, %r10 + adc %rdx, %r11 + mov %r10, (%rdi) + mov %r11, 8(%rdi) + mov 16(%rdi), %r10 + adc %r10, %r10 + sbb %ebp, %ebp + neg %ebp + mov 8(%rsi), %rax + mul %rax + add %ebx, %ebx + adc %rax, %r10 + adc %rbp, %rdx + mov %r10, 16(%rdi) + mov %rdx, 24(%rdi) + pop %r14 + pop %r13 + pop %r12 + pop %rbp pop %rbx ret - - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Lmtab:.long .Lmf7-.Lmtab - .long .Lmf0-.Lmtab - .long .Lmf1-.Lmtab - .long .Lmf2-.Lmtab - .long .Lmf3-.Lmtab - .long .Lmf4-.Lmtab - .long .Lmf5-.Lmtab - .long .Lmf6-.Lmtab -.Latab:.long .Lf6-.Latab - .long .Lf7-.Latab - .long .Lf0-.Latab - .long .Lf1-.Latab - .long .Lf2-.Latab - .long .Lf3-.Latab - .long .Lf4-.Latab - .long .Lf5-.Latab - .text .size __gmpn_sqr_basecase,.-__gmpn_sqr_basecase diff --git a/ext/gmp/gen/x86_64-linux/mpn/sub_err1_n.s b/ext/gmp/gen/x86_64-linux/mpn/sub_err1_n.s index 7db64b894e..cbef8af042 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/sub_err1_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/sub_err1_n.s @@ -189,20 +189,20 @@ __gmpn_sub_err1_n: .align 32, 0x90 .Lloop: - mov (%rsi,%r9,8), %r14 shr $1, %al mov -8(%r8), %r10 mov $0, %r13d + mov (%rsi,%r9,8), %r14 + mov 8(%rsi,%r9,8), %r15 sbb (%rdx,%r9,8), %r14 cmovnc %r13, %r10 - mov %r14, (%rdi,%r9,8) - mov 
8(%rsi,%r9,8), %r15 - mov 16(%rsi,%r9,8), %r14 sbb 8(%rdx,%r9,8), %r15 mov -16(%r8), %r11 + mov %r14, (%rdi,%r9,8) + mov 16(%rsi,%r9,8), %r14 + mov %r15, 8(%rdi,%r9,8) cmovnc %r13, %r11 mov -24(%r8), %r12 - mov %r15, 8(%rdi,%r9,8) sbb 16(%rdx,%r9,8), %r14 cmovnc %r13, %r12 mov 24(%rsi,%r9,8), %r15 @@ -215,12 +215,12 @@ __gmpn_sub_err1_n: adc $0, %rbp add %r12, %rbx adc $0, %rbp - lea -32(%r8), %r8 mov %r14, 16(%rdi,%r9,8) add %r13, %rbx + lea -32(%r8), %r8 adc $0, %rbp + mov %r15, 24(%rdi,%r9,8) add $4, %r9 - mov %r15, -8(%rdi,%r9,8) jnz .Lloop .Lend: diff --git a/ext/gmp/gen/x86_64-linux/mpn/sub_n.s b/ext/gmp/gen/x86_64-linux/mpn/sub_n.s index 2ae18233ca..8c1db0a02f 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/sub_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/sub_n.s @@ -94,20 +94,18 @@ __gmpn_sub_nc: - mov %ecx, %eax - shr $3, %rcx - and $7, %eax - - lea .Ltab(%rip), %r9 - neg %r8 + shr $2, %rcx + and $3, %eax + bt $0, %r8 + jrcxz .Llt4 - movslq (%r9,%rax,4), %rax - lea (%r9,%rax), %rax - jmp *%rax + mov (%rsi), %r8 + mov 8(%rsi), %r9 + dec %rcx + jmp .Lmid .size __gmpn_sub_nc,.-__gmpn_sub_nc - .align 16, 0x90 .globl __gmpn_sub_n .type __gmpn_sub_n,@function @@ -115,159 +113,82 @@ __gmpn_sub_nc: __gmpn_sub_n: - mov %ecx, %eax - shr $3, %rcx - and $7, %eax - - lea .Ltab(%rip), %r9 + shr $2, %rcx + and $3, %eax + jrcxz .Llt4 - movslq (%r9,%rax,4), %rax - lea (%r9,%rax), %rax - jmp *%rax - - -.L0: mov (%rsi), %r8 + mov (%rsi), %r8 mov 8(%rsi), %r9 + dec %rcx + jmp .Lmid + +.Llt4: dec %eax + mov (%rsi), %r8 + jnz .L2 sbb (%rdx), %r8 - jmp .Le0 + mov %r8, (%rdi) + adc %eax, %eax + + ret -.L4: mov (%rsi), %r8 +.L2: dec %eax mov 8(%rsi), %r9 + jnz .L3 sbb (%rdx), %r8 - lea -32(%rsi), %rsi - lea -32(%rdx), %rdx - lea -32(%rdi), %rdi - inc %rcx - jmp .Le4 - -.L5: mov (%rsi), %r11 - mov 8(%rsi), %r8 - mov 16(%rsi), %r9 - sbb (%rdx), %r11 - lea -24(%rsi), %rsi - lea -24(%rdx), %rdx - lea -24(%rdi), %rdi - inc %rcx - jmp .Le5 - -.L6: mov (%rsi), %r10 - sbb (%rdx), %r10 - mov 8(%rsi), 
%r11 - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi - inc %rcx - jmp .Le6 - -.L7: mov (%rsi), %r9 - mov 8(%rsi), %r10 - sbb (%rdx), %r9 - sbb 8(%rdx), %r10 - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi - inc %rcx - jmp .Le7 + sbb 8(%rdx), %r9 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + adc %eax, %eax + + ret - .align 16, 0x90 -.Ltop: -.Le3: mov %r9, 40(%rdi) -.Le2: mov %r10, 48(%rdi) -.Le1: mov (%rsi), %r8 - mov 8(%rsi), %r9 +.L3: mov 16(%rsi), %r10 sbb (%rdx), %r8 - mov %r11, 56(%rdi) - lea 64(%rdi), %rdi -.Le0: mov 16(%rsi), %r10 sbb 8(%rdx), %r9 sbb 16(%rdx), %r10 mov %r8, (%rdi) -.Le7: mov 24(%rsi), %r11 mov %r9, 8(%rdi) -.Le6: mov 32(%rsi), %r8 - mov 40(%rsi), %r9 - sbb 24(%rdx), %r11 mov %r10, 16(%rdi) -.Le5: sbb 32(%rdx), %r8 - mov %r11, 24(%rdi) -.Le4: mov 48(%rsi), %r10 - mov 56(%rsi), %r11 - mov %r8, 32(%rdi) - lea 64(%rsi), %rsi - sbb 40(%rdx), %r9 - sbb 48(%rdx), %r10 - sbb 56(%rdx), %r11 - lea 64(%rdx), %rdx - dec %rcx - jnz .Ltop - -.Lend: mov %r9, 40(%rdi) - mov %r10, 48(%rdi) - mov %r11, 56(%rdi) - mov %ecx, %eax - adc %ecx, %eax + setc %al ret .align 16, 0x90 -.L3: mov (%rsi), %r9 - mov 8(%rsi), %r10 - mov 16(%rsi), %r11 - sbb (%rdx), %r9 - sbb 8(%rdx), %r10 - sbb 16(%rdx), %r11 - jrcxz .Lx3 - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea -40(%rdi), %rdi - jmp .Le3 -.Lx3: mov %r9, (%rdi) - mov %r10, 8(%rdi) - mov %r11, 16(%rdi) - mov %ecx, %eax - adc %ecx, %eax - - ret +.Ltop: sbb (%rdx), %r8 + sbb 8(%rdx), %r9 + sbb 16(%rdx), %r10 + sbb 24(%rdx), %r11 + mov %r8, (%rdi) + lea 32(%rsi), %rsi + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + dec %rcx + mov %r11, 24(%rdi) + lea 32(%rdx), %rdx + mov (%rsi), %r8 + mov 8(%rsi), %r9 + lea 32(%rdi), %rdi +.Lmid: mov 16(%rsi), %r10 + mov 24(%rsi), %r11 + jnz .Ltop - .align 16, 0x90 -.L1: mov (%rsi), %r11 - sbb (%rdx), %r11 - jrcxz .Lx1 - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea -56(%rdi), %rdi - jmp .Le1 -.Lx1: mov %r11, (%rdi) - mov %ecx, %eax - adc %ecx, %eax - - ret +.Lend: lea 
32(%rsi), %rsi + sbb (%rdx), %r8 + sbb 8(%rdx), %r9 + sbb 16(%rdx), %r10 + sbb 24(%rdx), %r11 + lea 32(%rdx), %rdx + mov %r8, (%rdi) + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + mov %r11, 24(%rdi) + lea 32(%rdi), %rdi - .align 16, 0x90 -.L2: mov (%rsi), %r10 - mov 8(%rsi), %r11 - sbb (%rdx), %r10 - sbb 8(%rdx), %r11 - jrcxz .Lx2 - lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea -48(%rdi), %rdi - jmp .Le2 -.Lx2: mov %r10, (%rdi) - mov %r11, 8(%rdi) - mov %ecx, %eax - adc %ecx, %eax + inc %eax + dec %eax + jnz .Llt4 + adc %eax, %eax ret .size __gmpn_sub_n,.-__gmpn_sub_n - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .L0-.Ltab - .long .L1-.Ltab - .long .L2-.Ltab - .long .L3-.Ltab - .long .L4-.Ltab - .long .L5-.Ltab - .long .L6-.Ltab - .long .L7-.Ltab diff --git a/ext/gmp/gen/x86_64-linux/mpn/sublsh1_n.s b/ext/gmp/gen/x86_64-linux/mpn/sublsh1_n.s index cabbb914a0..d257a0544b 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/sublsh1_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/sublsh1_n.s @@ -58,14 +58,6 @@ - - - - - - - - @@ -76,7 +68,7 @@ .text - .align 8, 0x90 + .align 16, 0x90 .globl __gmpn_sublsh1_n .type __gmpn_sublsh1_n,@function @@ -84,107 +76,100 @@ __gmpn_sublsh1_n: push %rbx - push %r12 + push %rbp + mov (%rdx), %r8 mov %ecx, %eax - lea 24(%rsi,%rcx,8), %rsi - lea 24(%rdx,%rcx,8), %rdx - lea 24(%rdi,%rcx,8), %rdi + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rdx neg %rcx - - xor %r11d, %r11d - - mov -24(%rdx,%rcx,8), %r8 - shrd $63, %r8, %r11 - + xor %ebp, %ebp and $3, %eax - je .Lb0 + je .Lb00 cmp $2, %eax - jc .Lb1 - je .Lb2 - -.Lb3: mov -16(%rdx,%rcx,8), %r9 - shrd $63, %r9, %r8 - mov -8(%rdx,%rcx,8), %r10 - shrd $63, %r10, %r9 - mov -24(%rsi,%rcx,8), %r12 - sub %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - mov -16(%rsi,%rcx,8), %r12 - sbb %r8, %r12 - mov %r12, -16(%rdi,%rcx,8) - mov -8(%rsi,%rcx,8), %r12 - sbb %r9, %r12 - mov %r12, -8(%rdi,%rcx,8) - mov %r10, %r11 + jc .Lb01 + je .Lb10 + +.Lb11: add %r8, %r8 + mov 
8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 sbb %eax, %eax + mov (%rsi,%rcx,8), %rbp + mov 8(%rsi,%rcx,8), %rbx + sub %r8, %rbp + sbb %r9, %rbx + mov %rbp, (%rdi,%rcx,8) + mov %rbx, 8(%rdi,%rcx,8) + mov 16(%rsi,%rcx,8), %rbp + sbb %r10, %rbp + mov %rbp, 16(%rdi,%rcx,8) + sbb %ebp, %ebp add $3, %rcx - js .Ltop - jmp .Lend + jmp .Lent -.Lb1: mov -24(%rsi,%rcx,8), %r12 - sub %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - mov %r8, %r11 - sbb %eax, %eax - inc %rcx - js .Ltop - jmp .Lend - -.Lb2: mov -16(%rdx,%rcx,8), %r9 - shrd $63, %r9, %r8 - mov -24(%rsi,%rcx,8), %r12 - sub %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - mov -16(%rsi,%rcx,8), %r12 - sbb %r8, %r12 - mov %r12, -16(%rdi,%rcx,8) - mov %r9, %r11 +.Lb10: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 sbb %eax, %eax + mov (%rsi,%rcx,8), %rbp + mov 8(%rsi,%rcx,8), %rbx + sub %r8, %rbp + sbb %r9, %rbx + mov %rbp, (%rdi,%rcx,8) + mov %rbx, 8(%rdi,%rcx,8) + sbb %ebp, %ebp add $2, %rcx - js .Ltop - jmp .Lend + jmp .Lent - .align 16, 0x90 -.Ltop: mov -24(%rdx,%rcx,8), %r8 - shrd $63, %r8, %r11 -.Lb0: mov -16(%rdx,%rcx,8), %r9 - shrd $63, %r9, %r8 - mov -8(%rdx,%rcx,8), %r10 - shrd $63, %r10, %r9 - mov (%rdx,%rcx,8), %rbx - shrd $63, %rbx, %r10 - - add %eax, %eax - - mov -24(%rsi,%rcx,8), %r12 - sbb %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - - mov -16(%rsi,%rcx,8), %r12 - sbb %r8, %r12 - mov %r12, -16(%rdi,%rcx,8) - - mov -8(%rsi,%rcx,8), %r12 - sbb %r9, %r12 - mov %r12, -8(%rdi,%rcx,8) - - mov (%rsi,%rcx,8), %r12 - sbb %r10, %r12 - mov %r12, (%rdi,%rcx,8) - - mov %rbx, %r11 +.Lb01: add %r8, %r8 sbb %eax, %eax + mov (%rsi,%rcx,8), %rbp + sub %r8, %rbp + mov %rbp, (%rdi,%rcx,8) + sbb %ebp, %ebp + inc %rcx +.Lent: jns .Lend + + .align 16, 0x90 +.Ltop: add %eax, %eax + mov (%rdx,%rcx,8), %r8 +.Lb00: adc %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 + mov 24(%rdx,%rcx,8), %r11 + adc %r11, %r11 + + sbb %eax, %eax + add %ebp, %ebp + + mov 
(%rsi,%rcx,8), %rbp + mov 8(%rsi,%rcx,8), %rbx + sbb %r8, %rbp + sbb %r9, %rbx + mov %rbp, (%rdi,%rcx,8) + mov %rbx, 8(%rdi,%rcx,8) + mov 16(%rsi,%rcx,8), %rbp + mov 24(%rsi,%rcx,8), %rbx + sbb %r10, %rbp + sbb %r11, %rbx + mov %rbp, 16(%rdi,%rcx,8) + mov %rbx, 24(%rdi,%rcx,8) + + sbb %ebp, %ebp add $4, %rcx js .Ltop -.Lend: shr $63, %r11 - pop %r12 - pop %rbx - sub %r11d, %eax +.Lend: add %ebp, %eax neg %eax + + pop %rbp + pop %rbx ret .size __gmpn_sublsh1_n,.-__gmpn_sublsh1_n - diff --git a/ext/gmp/gen/x86_64-linux/mpn/sublsh2_n.s b/ext/gmp/gen/x86_64-linux/mpn/sublsh2_n.s deleted file mode 100644 index d5bf3a7be3..0000000000 --- a/ext/gmp/gen/x86_64-linux/mpn/sublsh2_n.s +++ /dev/null @@ -1,190 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - .text - .align 8, 0x90 - .globl __gmpn_sublsh2_n - .type __gmpn_sublsh2_n,@function - -__gmpn_sublsh2_n: - - - push %rbx - push %r12 - - mov %ecx, %eax - lea 24(%rsi,%rcx,8), %rsi - lea 24(%rdx,%rcx,8), %rdx - lea 24(%rdi,%rcx,8), %rdi - neg %rcx - - xor %r11d, %r11d - - mov -24(%rdx,%rcx,8), %r8 - shrd $62, %r8, %r11 - - and $3, %eax - je .Lb0 - cmp $2, %eax - jc .Lb1 - je .Lb2 - -.Lb3: mov -16(%rdx,%rcx,8), %r9 - shrd $62, %r9, %r8 - mov -8(%rdx,%rcx,8), %r10 - shrd $62, %r10, %r9 - mov -24(%rsi,%rcx,8), %r12 - sub %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - mov -16(%rsi,%rcx,8), %r12 - sbb %r8, %r12 - mov %r12, -16(%rdi,%rcx,8) - mov -8(%rsi,%rcx,8), %r12 - sbb %r9, %r12 - mov %r12, -8(%rdi,%rcx,8) - mov %r10, %r11 - sbb %eax, %eax - add $3, %rcx - js .Ltop - jmp .Lend - -.Lb1: mov -24(%rsi,%rcx,8), %r12 - sub %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - mov %r8, %r11 - sbb %eax, %eax - inc %rcx - js .Ltop - jmp .Lend - -.Lb2: mov -16(%rdx,%rcx,8), %r9 - shrd $62, %r9, %r8 - mov -24(%rsi,%rcx,8), %r12 - sub %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - mov -16(%rsi,%rcx,8), %r12 - sbb %r8, %r12 - mov %r12, 
-16(%rdi,%rcx,8) - mov %r9, %r11 - sbb %eax, %eax - add $2, %rcx - js .Ltop - jmp .Lend - - .align 16, 0x90 -.Ltop: mov -24(%rdx,%rcx,8), %r8 - shrd $62, %r8, %r11 -.Lb0: mov -16(%rdx,%rcx,8), %r9 - shrd $62, %r9, %r8 - mov -8(%rdx,%rcx,8), %r10 - shrd $62, %r10, %r9 - mov (%rdx,%rcx,8), %rbx - shrd $62, %rbx, %r10 - - add %eax, %eax - - mov -24(%rsi,%rcx,8), %r12 - sbb %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - - mov -16(%rsi,%rcx,8), %r12 - sbb %r8, %r12 - mov %r12, -16(%rdi,%rcx,8) - - mov -8(%rsi,%rcx,8), %r12 - sbb %r9, %r12 - mov %r12, -8(%rdi,%rcx,8) - - mov (%rsi,%rcx,8), %r12 - sbb %r10, %r12 - mov %r12, (%rdi,%rcx,8) - - mov %rbx, %r11 - sbb %eax, %eax - - add $4, %rcx - js .Ltop - -.Lend: shr $62, %r11 - pop %r12 - pop %rbx - sub %r11d, %eax - neg %eax - - ret - .size __gmpn_sublsh2_n,.-__gmpn_sublsh2_n - diff --git a/ext/gmp/gen/x86_64-linux/mpn/submul_1.s b/ext/gmp/gen/x86_64-linux/mpn/submul_1.s index 07aaadb7bb..5e34932b8d 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/submul_1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/submul_1.s @@ -68,6 +68,7 @@ + @@ -78,10 +79,8 @@ - - - - + + @@ -89,6 +88,7 @@ + .text .align 16, 0x90 @@ -97,115 +97,100 @@ __gmpn_submul_1: - + + + + + + mov (%rsi), %rax push %rbx - push %rbp - push %r12 - push %r13 - - mov %rdx, %rbp - mov %rcx, %rdx - - test $1, %bpl - jnz .Lbx1 - -.Lbx0: shr $2, %rbp - jc .Lb10 - -.Lb00: .byte 0xc4,98,147,0xf6,38 - .byte 0xc4,226,227,0xf6,70,8 - add %r12, %rbx - adc $0, %rax - mov (%rdi), %r12 - mov 8(%rdi), %rcx - .byte 0xc4,98,179,0xf6,70,16 - lea -16(%rdi), %rdi - lea 16(%rsi), %rsi - sub %r13, %r12 - jmp .Llo0 - -.Lbx1: shr $2, %rbp - jc .Lb11 - -.Lb01: .byte 0xc4,98,163,0xf6,22 - jnz .Lgt1 -.Ln1: sub %r11, (%rdi) - mov $0, %eax - adc %r10, %rax - jmp .Lret + mov %rdx, %rbx + + mul %rcx + mov %rbx, %r11 -.Lgt1: .byte 0xc4,98,147,0xf6,102,8 - .byte 0xc4,226,227,0xf6,70,16 - lea 24(%rsi), %rsi - add %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov (%rdi), %r10 - mov 8(%rdi), %r12 - mov 16(%rdi), %rcx - 
lea -8(%rdi), %rdi - sub %r11, %r10 - jmp .Llo1 - -.Lb11: .byte 0xc4,226,227,0xf6,6 - mov (%rdi), %rcx - .byte 0xc4,98,179,0xf6,70,8 - lea 8(%rsi), %rsi - lea -24(%rdi), %rdi - inc %rbp - sub %rbx, %rcx - jmp .Llo3 - -.Lb10: .byte 0xc4,98,179,0xf6,6 - .byte 0xc4,98,163,0xf6,86,8 - lea -32(%rdi), %rdi - mov $0, %eax - clc - jz .Lend + and $3, %ebx + jz .Lb0 + cmp $2, %ebx + jz .Lb2 + jg .Lb3 + +.Lb1: dec %r11 + jne .Lgt1 + sub %rax, (%rdi) + jmp .Lret +.Lgt1: lea 8(%rsi,%r11,8), %rsi + lea -8(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + xor %ebx, %ebx + mov %rax, %r9 + mov (%rsi,%r11,8), %rax + mov %rdx, %r8 + jmp .LL1 + +.Lb0: lea (%rsi,%r11,8), %rsi + lea -16(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + mov %rax, %r8 + mov %rdx, %rbx + jmp .LL0 + +.Lb3: lea -8(%rsi,%r11,8), %rsi + lea -24(%rdi,%r11,8), %rdi + neg %r11 + mov %rax, %rbx + mov %rdx, %r10 + jmp .LL3 + +.Lb2: lea -16(%rsi,%r11,8), %rsi + lea -32(%rdi,%r11,8), %rdi + neg %r11 + xor %r8, %r8 + xor %ebx, %ebx + mov %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %rdx, %r9 + jmp .LL2 .align 16, 0x90 -.Ltop: adc %rax, %r9 - lea 32(%rdi), %rdi - adc %r8, %r11 - .byte 0xc4,98,147,0xf6,102,16 - mov (%rdi), %r8 - .byte 0xc4,226,227,0xf6,70,24 - lea 32(%rsi), %rsi - adc %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov 8(%rdi), %r10 - mov 16(%rdi), %r12 - sub %r9, %r8 - mov 24(%rdi), %rcx - mov %r8, (%rdi) - sbb %r11, %r10 -.Llo1: .byte 0xc4,98,179,0xf6,6 - mov %r10, 8(%rdi) - sbb %r13, %r12 -.Llo0: mov %r12, 16(%rdi) - sbb %rbx, %rcx -.Llo3: .byte 0xc4,98,163,0xf6,86,8 - mov %rcx, 24(%rdi) - dec %rbp - jnz .Ltop - -.Lend: adc %rax, %r9 - adc %r8, %r11 - mov 32(%rdi), %r8 - mov %r10, %rax - adc $0, %rax - mov 40(%rdi), %r10 - sub %r9, %r8 - mov %r8, 32(%rdi) - sbb %r11, %r10 - mov %r10, 40(%rdi) - adc $0, %rax - -.Lret: pop %r13 - pop %r12 - pop %rbp +.Ltop: sub %r10, (%rdi,%r11,8) + adc %rax, %r9 + mov (%rsi,%r11,8), %rax + adc %rdx, %r8 + mov $0, %r10d +.LL1: mul %rcx + sub %r9, 8(%rdi,%r11,8) + adc %rax, 
%r8 + adc %rdx, %rbx +.LL0: mov 8(%rsi,%r11,8), %rax + mul %rcx + sub %r8, 16(%rdi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 +.LL3: mov 16(%rsi,%r11,8), %rax + mul %rcx + sub %rbx, 24(%rdi,%r11,8) + mov $0, %r8d + mov %r8, %rbx + adc %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %r8, %r9 + adc %rdx, %r9 +.LL2: mul %rcx + add $4, %r11 + js .Ltop + + sub %r10, (%rdi,%r11,8) + adc %rax, %r9 + adc %r8, %rdx + sub %r9, 8(%rdi,%r11,8) +.Lret: adc $0, %rdx + mov %rdx, %rax + pop %rbx - + + ret .size __gmpn_submul_1,.-__gmpn_submul_1 diff --git a/ext/gmp/gen/x86_64-linux/mpn/xnor_n.s b/ext/gmp/gen/x86_64-linux/mpn/xnor_n.s index 360b9b8869..4db0497767 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/xnor_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/xnor_n.s @@ -92,7 +92,6 @@ - .text @@ -106,54 +105,46 @@ __gmpn_xnor_n: mov (%rdx), %r8 not %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: xor (%rsi), %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi +.Lb11: xor (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: xor (%rsi), %r8 - mov %r8, (%rdi) - dec %rcx + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: xor (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 +.Ltop: mov (%rdx,%rcx,8), %r8 not %r8 -.Lb00: mov 8(%rdx), %r9 +.Lb00: mov 8(%rdx,%rcx,8), %r9 not %r9 - xor (%rsi), %r8 - xor 8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 + xor (%rsi,%rcx,8), %r8 + xor 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 not %r8 -.Le10: mov 24(%rdx), %r9 +.Le10: mov 
24(%rdx,%rcx,8), %r9 not %r9 - lea 32(%rdx), %rdx - xor 16(%rsi), %r8 - xor 24(%rsi), %r9 - lea 32(%rsi), %rsi - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + xor 16(%rsi,%rcx,8), %r8 + xor 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/xor_n.s b/ext/gmp/gen/x86_64-linux/mpn/xor_n.s index 6889f2720a..8ef14d059c 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/xor_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/xor_n.s @@ -90,7 +90,6 @@ - .text @@ -103,50 +102,42 @@ __gmpn_xor_n: mov (%rdx), %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: xor (%rsi), %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi +.Lb11: xor (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: xor (%rsi), %r8 - mov %r8, (%rdi) - dec %rcx +.Lb01: xor (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 -.Lb00: mov 8(%rdx), %r9 - xor (%rsi), %r8 - xor 8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 -.Le10: mov 24(%rdx), %r9 - lea 32(%rdx), %rdx - xor 16(%rsi), %r8 - xor 24(%rsi), %r9 - lea 32(%rsi), %rsi - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + +.Ltop: mov (%rdx,%rcx,8), %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + xor (%rsi,%rcx,8), %r8 + xor 8(%rsi,%rcx,8), %r9 + nop + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + xor 16(%rsi,%rcx,8), %r8 + xor 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 
24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret From ec02731ceab9e3b69e4479de11a167e104ed53c7 Mon Sep 17 00:00:00 2001 From: Santeri Hannula Date: Thu, 10 Oct 2024 18:42:35 +0300 Subject: [PATCH 87/97] ext: add generated gmp config header for each target --- ext/gmp/build.zig | 323 -------------- ext/gmp/gen/aarch64-linux/config.h | 668 +++++++++++++++++++++++++++++ ext/gmp/gen/aarch64-macos/config.h | 668 +++++++++++++++++++++++++++++ ext/gmp/gen/x86_64-linux/config.h | 668 +++++++++++++++++++++++++++++ ext/gmp/gen/x86_64-macos/config.h | 668 +++++++++++++++++++++++++++++ 5 files changed, 2672 insertions(+), 323 deletions(-) create mode 100644 ext/gmp/gen/aarch64-linux/config.h create mode 100644 ext/gmp/gen/aarch64-macos/config.h create mode 100644 ext/gmp/gen/x86_64-linux/config.h create mode 100644 ext/gmp/gen/x86_64-macos/config.h diff --git a/ext/gmp/build.zig b/ext/gmp/build.zig index cdfad41394..2444000e3d 100644 --- a/ext/gmp/build.zig +++ b/ext/gmp/build.zig @@ -18,328 +18,6 @@ pub fn build(b: *std.Build) void { lib.linkLibC(); - // TODO: The values here should be provided programmatically - const config_h = b.addConfigHeader(.{ - .style = .{ - .autoconf = dep_c.path("config.in"), - }, - .include_path = "config.h", - }, .{ - .GMP_MPARAM_H_SUGGEST = "./mpn/arm64/gmp-mparam.h", - .HAVE_ALARM = 1, - .HAVE_ALLOCA = 1, - .HAVE_ALLOCA_H = 1, - .HAVE_ATTRIBUTE_CONST = 1, - .HAVE_ATTRIBUTE_MALLOC = 1, - .HAVE_ATTRIBUTE_MODE = 1, - .HAVE_ATTRIBUTE_NORETURN = 1, - .HAVE_CLOCK = 1, - .HAVE_CLOCK_GETTIME = 1, - .HAVE_DECL_FGETC = 1, - .HAVE_DECL_FSCANF = 1, - .HAVE_DECL_OPTARG = 1, - .HAVE_DECL_SYS_ERRLIST = 1, - .HAVE_DECL_SYS_NERR = 1, - .HAVE_DECL_UNGETC = 1, - .HAVE_DECL_VFPRINTF = 1, - .HAVE_DLFCN_H = 1, - .HAVE_DOUBLE_IEEE_LITTLE_ENDIAN = 1, - .HAVE_FCNTL_H = 1, - .HAVE_FLOAT_H = 1, - .HAVE_GETPAGESIZE = 1, - .HAVE_GETRUSAGE = 1, - .HAVE_GETTIMEOFDAY = 1, - .HAVE_INTMAX_T = 1, - .HAVE_INTPTR_T = 1, - .HAVE_INTTYPES_H = 1, - .HAVE_LANGINFO_H = 1, - 
.HAVE_LIMB_LITTLE_ENDIAN = 1, - .HAVE_LOCALECONV = 1, - .HAVE_LOCALE_H = 1, - .HAVE_LONG_DOUBLE = 1, - .HAVE_LONG_LONG = 1, - .HAVE_MEMORY_H = 1, - .HAVE_MEMSET = 1, - .HAVE_MMAP = 1, - .HAVE_MPROTECT = 1, - .HAVE_NATIVE_mpn_add_n = 1, - .HAVE_NATIVE_mpn_add_nc = 1, - .HAVE_NATIVE_mpn_addlsh1_n = 1, - .HAVE_NATIVE_mpn_addlsh2_n = 1, - .HAVE_NATIVE_mpn_and_n = 1, - .HAVE_NATIVE_mpn_andn_n = 1, - .HAVE_NATIVE_mpn_bdiv_dbm1c = 1, - .HAVE_NATIVE_mpn_bdiv_q_1 = 1, - .HAVE_NATIVE_mpn_pi1_bdiv_q_1 = 1, - .HAVE_NATIVE_mpn_cnd_add_n = 1, - .HAVE_NATIVE_mpn_cnd_sub_n = 1, - .HAVE_NATIVE_mpn_com = 1, - .HAVE_NATIVE_mpn_copyd = 1, - .HAVE_NATIVE_mpn_copyi = 1, - .HAVE_NATIVE_mpn_gcd_11 = 1, - .HAVE_NATIVE_mpn_gcd_22 = 1, - .HAVE_NATIVE_mpn_hamdist = 1, - .HAVE_NATIVE_mpn_invert_limb = 1, - .HAVE_NATIVE_mpn_ior_n = 1, - .HAVE_NATIVE_mpn_iorn_n = 1, - .HAVE_NATIVE_mpn_lshift = 1, - .HAVE_NATIVE_mpn_lshiftc = 1, - .HAVE_NATIVE_mpn_mod_34lsub1 = 1, - .HAVE_NATIVE_mpn_mul_1 = 1, - .HAVE_NATIVE_mpn_mul_1c = 1, - .HAVE_NATIVE_mpn_nand_n = 1, - .HAVE_NATIVE_mpn_nior_n = 1, - .HAVE_NATIVE_mpn_popcount = 1, - .HAVE_NATIVE_mpn_rsblsh1_n = 1, - .HAVE_NATIVE_mpn_rsblsh2_n = 1, - .HAVE_NATIVE_mpn_rsh1add_n = 1, - .HAVE_NATIVE_mpn_rsh1sub_n = 1, - .HAVE_NATIVE_mpn_rshift = 1, - .HAVE_NATIVE_mpn_sqr_diag_addlsh1 = 1, - .HAVE_NATIVE_mpn_sub_n = 1, - .HAVE_NATIVE_mpn_sub_nc = 1, - .HAVE_NATIVE_mpn_sublsh1_n = 1, - .HAVE_NATIVE_mpn_sublsh2_n = 1, - .HAVE_NATIVE_mpn_xor_n = 1, - .HAVE_NATIVE_mpn_xnor_n = 1, - .HAVE_NL_LANGINFO = 1, - .HAVE_NL_TYPES_H = 1, - .HAVE_POPEN = 1, - .HAVE_PROCESSOR_INFO = 1, - .HAVE_PTRDIFF_T = 1, - .HAVE_QUAD_T = 1, - .HAVE_RAISE = 1, - .HAVE_SIGACTION = 1, - .HAVE_SIGALTSTACK = 1, - .HAVE_STACK_T = 1, - .HAVE_STDINT_H = 1, - .HAVE_STDLIB_H = 1, - .HAVE_STRCHR = 1, - .HAVE_STRERROR = 1, - .HAVE_STRINGS_H = 1, - .HAVE_STRING_H = 1, - .HAVE_STRNLEN = 1, - .HAVE_STRTOL = 1, - .HAVE_STRTOUL = 1, - .HAVE_SYSCONF = 1, - .HAVE_SYSCTL = 1, - .HAVE_SYSCTLBYNAME = 1, - 
.HAVE_SYS_MMAN_H = 1, - .HAVE_SYS_PARAM_H = 1, - .HAVE_SYS_RESOURCE_H = 1, - .HAVE_SYS_STAT_H = 1, - .HAVE_SYS_SYSCTL_H = 1, - .HAVE_SYS_TIMES_H = 1, - .HAVE_SYS_TIME_H = 1, - .HAVE_SYS_TYPES_H = 1, - .HAVE_TIMES = 1, - .HAVE_UINT_LEAST32_T = 1, - .HAVE_UNISTD_H = 1, - .HAVE_VSNPRINTF = 1, - .LSYM_PREFIX = "L", - .LT_OBJDIR = ".libs/", - .PACKAGE = "gmp", - .PACKAGE_BUGREPORT = "gmp-bugs@gmplib.org, see https://gmplib.org/manual/Reporting-Bugs.html", - .PACKAGE_NAME = "GNU MP", - .PACKAGE_STRING = "GNU MP 6.2.1", - .PACKAGE_TARNAME = "gmp", - .PACKAGE_URL = "http://www.gnu.org/software/gmp/", - .PACKAGE_VERSION = "6.2.1", - .RETSIGTYPE = null, - .SIZEOF_MP_LIMB_T = 8, - .SIZEOF_UNSIGNED = 4, - .SIZEOF_UNSIGNED_LONG = 8, - .SIZEOF_UNSIGNED_SHORT = 2, - .SIZEOF_VOID_P = 8, - .STDC_HEADERS = 1, - .TIME_WITH_SYS_TIME = 1, - .TUNE_SQR_TOOM2_MAX = "SQR_TOOM2_MAX_GENERIC", - .VERSION = "6.2.1", - .WANT_FFT = 1, - .WANT_TMP_ALLOCA = 1, - .YYTEXT_POINTER = 1, - .restrict = .__restrict, - .AC_APPLE_UNIVERSAL_BUILD = null, - .HAVE_ATTR_GET = null, - .HAVE_CALLING_CONVENTIONS = null, - .HAVE_CPUTIME = null, - .HAVE_DOUBLE_IEEE_BIG_ENDIAN = null, - .HAVE_DOUBLE_IEEE_LITTLE_SWAPPED = null, - .HAVE_DOUBLE_VAX_D = null, - .HAVE_DOUBLE_VAX_G = null, - .HAVE_DOUBLE_CRAY_CFP = null, - .HAVE_GETSYSINFO = null, - .HAVE_HIDDEN_ALIAS = null, - .HAVE_HOST_CPU_FAMILY_alpha = null, - .HAVE_HOST_CPU_FAMILY_m68k = null, - .HAVE_HOST_CPU_FAMILY_power = null, - .HAVE_HOST_CPU_FAMILY_powerpc = null, - .HAVE_HOST_CPU_FAMILY_x86 = null, - .HAVE_HOST_CPU_FAMILY_x86_64 = null, - .HAVE_HOST_CPU_alphaev67 = null, - .HAVE_HOST_CPU_alphaev68 = null, - .HAVE_HOST_CPU_alphaev7 = null, - .HAVE_HOST_CPU_m68020 = null, - .HAVE_HOST_CPU_m68030 = null, - .HAVE_HOST_CPU_m68040 = null, - .HAVE_HOST_CPU_m68060 = null, - .HAVE_HOST_CPU_m68360 = null, - .HAVE_HOST_CPU_powerpc604 = null, - .HAVE_HOST_CPU_powerpc604e = null, - .HAVE_HOST_CPU_powerpc750 = null, - .HAVE_HOST_CPU_powerpc7400 = null, - 
.HAVE_HOST_CPU_supersparc = null, - .HAVE_HOST_CPU_i386 = null, - .HAVE_HOST_CPU_i586 = null, - .HAVE_HOST_CPU_i686 = null, - .HAVE_HOST_CPU_pentium = null, - .HAVE_HOST_CPU_pentiummmx = null, - .HAVE_HOST_CPU_pentiumpro = null, - .HAVE_HOST_CPU_pentium2 = null, - .HAVE_HOST_CPU_pentium3 = null, - .HAVE_HOST_CPU_pentium4 = null, - .HAVE_HOST_CPU_core2 = null, - .HAVE_HOST_CPU_nehalem = null, - .HAVE_HOST_CPU_westmere = null, - .HAVE_HOST_CPU_sandybridge = null, - .HAVE_HOST_CPU_ivybridge = null, - .HAVE_HOST_CPU_haswell = null, - .HAVE_HOST_CPU_broadwell = null, - .HAVE_HOST_CPU_skylake = null, - .HAVE_HOST_CPU_silvermont = null, - .HAVE_HOST_CPU_goldmont = null, - .HAVE_HOST_CPU_k8 = null, - .HAVE_HOST_CPU_k10 = null, - .HAVE_HOST_CPU_bulldozer = null, - .HAVE_HOST_CPU_piledriver = null, - .HAVE_HOST_CPU_steamroller = null, - .HAVE_HOST_CPU_excavator = null, - .HAVE_HOST_CPU_zen = null, - .HAVE_HOST_CPU_bobcat = null, - .HAVE_HOST_CPU_jaguar = null, - .HAVE_HOST_CPU_s390_z900 = null, - .HAVE_HOST_CPU_s390_z990 = null, - .HAVE_HOST_CPU_s390_z9 = null, - .HAVE_HOST_CPU_s390_z10 = null, - .HAVE_HOST_CPU_s390_z196 = null, - .HAVE_HOST_CPU_s390_zarch = null, - .HAVE_INVENT_H = null, - .HAVE_LIMB_BIG_ENDIAN = null, - .HAVE_MACHINE_HAL_SYSINFO_H = null, - .HAVE_NATIVE_mpn_add_n_sub_n = null, - .HAVE_NATIVE_mpn_addaddmul_1msb0 = null, - .HAVE_NATIVE_mpn_addlsh_n = null, - .HAVE_NATIVE_mpn_addlsh1_nc = null, - .HAVE_NATIVE_mpn_addlsh2_nc = null, - .HAVE_NATIVE_mpn_addlsh_nc = null, - .HAVE_NATIVE_mpn_addlsh1_n_ip1 = null, - .HAVE_NATIVE_mpn_addlsh2_n_ip1 = null, - .HAVE_NATIVE_mpn_addlsh_n_ip1 = null, - .HAVE_NATIVE_mpn_addlsh1_nc_ip1 = null, - .HAVE_NATIVE_mpn_addlsh2_nc_ip1 = null, - .HAVE_NATIVE_mpn_addlsh_nc_ip1 = null, - .HAVE_NATIVE_mpn_addlsh1_n_ip2 = null, - .HAVE_NATIVE_mpn_addlsh2_n_ip2 = null, - .HAVE_NATIVE_mpn_addlsh_n_ip2 = null, - .HAVE_NATIVE_mpn_addlsh1_nc_ip2 = null, - .HAVE_NATIVE_mpn_addlsh2_nc_ip2 = null, - .HAVE_NATIVE_mpn_addlsh_nc_ip2 = null, - 
.HAVE_NATIVE_mpn_addmul_1c = null, - .HAVE_NATIVE_mpn_addmul_2 = null, - .HAVE_NATIVE_mpn_addmul_3 = null, - .HAVE_NATIVE_mpn_addmul_4 = null, - .HAVE_NATIVE_mpn_addmul_5 = null, - .HAVE_NATIVE_mpn_addmul_6 = null, - .HAVE_NATIVE_mpn_addmul_7 = null, - .HAVE_NATIVE_mpn_addmul_8 = null, - .HAVE_NATIVE_mpn_addmul_2s = null, - .HAVE_NATIVE_mpn_div_qr_1n_pi1 = null, - .HAVE_NATIVE_mpn_div_qr_2 = null, - .HAVE_NATIVE_mpn_divexact_1 = null, - .HAVE_NATIVE_mpn_divexact_by3c = null, - .HAVE_NATIVE_mpn_divrem_1 = null, - .HAVE_NATIVE_mpn_divrem_1c = null, - .HAVE_NATIVE_mpn_divrem_2 = null, - .HAVE_NATIVE_mpn_gcd_1 = null, - .HAVE_NATIVE_mpn_lshsub_n = null, - .HAVE_NATIVE_mpn_mod_1 = null, - .HAVE_NATIVE_mpn_mod_1_1p = null, - .HAVE_NATIVE_mpn_mod_1c = null, - .HAVE_NATIVE_mpn_mod_1s_2p = null, - .HAVE_NATIVE_mpn_mod_1s_4p = null, - .HAVE_NATIVE_mpn_modexact_1_odd = null, - .HAVE_NATIVE_mpn_modexact_1c_odd = null, - .HAVE_NATIVE_mpn_mul_2 = null, - .HAVE_NATIVE_mpn_mul_3 = null, - .HAVE_NATIVE_mpn_mul_4 = null, - .HAVE_NATIVE_mpn_mul_5 = null, - .HAVE_NATIVE_mpn_mul_6 = null, - .HAVE_NATIVE_mpn_mul_basecase = null, - .HAVE_NATIVE_mpn_mullo_basecase = null, - .HAVE_NATIVE_mpn_preinv_divrem_1 = null, - .HAVE_NATIVE_mpn_preinv_mod_1 = null, - .HAVE_NATIVE_mpn_redc_1 = null, - .HAVE_NATIVE_mpn_redc_2 = null, - .HAVE_NATIVE_mpn_rsblsh_n = null, - .HAVE_NATIVE_mpn_rsblsh1_nc = null, - .HAVE_NATIVE_mpn_rsblsh2_nc = null, - .HAVE_NATIVE_mpn_rsblsh_nc = null, - .HAVE_NATIVE_mpn_rsh1add_nc = null, - .HAVE_NATIVE_mpn_rsh1sub_nc = null, - .HAVE_NATIVE_mpn_sbpi1_bdiv_r = null, - .HAVE_NATIVE_mpn_sqr_basecase = null, - .HAVE_NATIVE_mpn_sqr_diagonal = null, - .HAVE_NATIVE_mpn_sublsh_n = null, - .HAVE_NATIVE_mpn_sublsh1_nc = null, - .HAVE_NATIVE_mpn_sublsh2_nc = null, - .HAVE_NATIVE_mpn_sublsh_nc = null, - .HAVE_NATIVE_mpn_sublsh1_n_ip1 = null, - .HAVE_NATIVE_mpn_sublsh2_n_ip1 = null, - .HAVE_NATIVE_mpn_sublsh_n_ip1 = null, - .HAVE_NATIVE_mpn_sublsh1_nc_ip1 = null, - 
.HAVE_NATIVE_mpn_sublsh2_nc_ip1 = null, - .HAVE_NATIVE_mpn_sublsh_nc_ip1 = null, - .HAVE_NATIVE_mpn_submul_1c = null, - .HAVE_NATIVE_mpn_tabselect = null, - .HAVE_NATIVE_mpn_udiv_qrnnd = null, - .HAVE_NATIVE_mpn_udiv_qrnnd_r = null, - .HAVE_NATIVE_mpn_umul_ppmm = null, - .HAVE_NATIVE_mpn_umul_ppmm_r = null, - .HAVE_OBSTACK_VPRINTF = null, - .HAVE_PSP_ITICKSPERCLKTICK = null, - .HAVE_PSTAT_GETPROCESSOR = null, - .HAVE_READ_REAL_TIME = null, - .HAVE_SIGSTACK = null, - .HAVE_SPEED_CYCLECOUNTER = null, - .HAVE_SSTREAM = null, - .HAVE_STD__LOCALE = null, - .HAVE_SYSSGI = null, - .HAVE_SYS_ATTRIBUTES_H = null, - .HAVE_SYS_IOGRAPH_H = null, - .HAVE_SYS_PROCESSOR_H = null, - .HAVE_SYS_PSTAT_H = null, - .HAVE_SYS_SYSINFO_H = null, - .HAVE_SYS_SYSSGI_H = null, - .HAVE_SYS_SYSTEMCFG_H = null, - .HOST_DOS64 = null, - .NO_ASM = null, - .SSCANF_WRITABLE_INPUT = null, - .WANT_ASSERT = null, - .WANT_FAKE_CPUID = null, - .WANT_FAT_BINARY = null, - .WANT_OLD_FFT_FULL = null, - .WANT_PROFILING_GPROF = null, - .WANT_PROFILING_INSTRUMENT = null, - .WANT_PROFILING_PROF = null, - .WANT_TMP_REENTRANT = null, - .WANT_TMP_NOTREENTRANT = null, - .WANT_TMP_DEBUG = null, - .WORDS_BIGENDIAN = null, - .X86_ASM_MULX = null, - .@"inline" = null, - .@"volatile" = null, - }); - // TODO: Finish this const gmp_h = b.addConfigHeader(.{ .style = .{ @@ -357,7 +35,6 @@ pub fn build(b: *std.Build) void { .CFLAGS = "-O2 -pedantic -march=armv8-a", }); - lib.addConfigHeader(config_h); lib.addConfigHeader(gmp_h); // Static headers diff --git a/ext/gmp/gen/aarch64-linux/config.h b/ext/gmp/gen/aarch64-linux/config.h new file mode 100644 index 0000000000..d2e56c8c54 --- /dev/null +++ b/ext/gmp/gen/aarch64-linux/config.h @@ -0,0 +1,668 @@ +/* config.h. Generated from config.in by configure. */ +/* config.in. Generated from configure.ac by autoheader. */ + +/* + +Copyright 1996-2020 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. +*/ + +/* Define if building universal (internal helper macro) */ +/* #undef AC_APPLE_UNIVERSAL_BUILD */ + +/* The gmp-mparam.h file (a string) the tune program should suggest updating. + */ +#define GMP_MPARAM_H_SUGGEST "./mpn/arm64/gmp-mparam.h" + +/* Define to 1 if you have the `alarm' function. */ +#define HAVE_ALARM 1 + +/* Define to 1 if alloca() works (via gmp-impl.h). */ +#define HAVE_ALLOCA 1 + +/* Define to 1 if you have and it should be used (not on Ultrix). + */ +#define HAVE_ALLOCA_H 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((const)) */ +#define HAVE_ATTRIBUTE_CONST 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((malloc)) */ +#define HAVE_ATTRIBUTE_MALLOC 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((mode (XX))) + */ +#define HAVE_ATTRIBUTE_MODE 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((noreturn)) */ +#define HAVE_ATTRIBUTE_NORETURN 1 + +/* Define to 1 if you have the `attr_get' function. 
*/ +/* #undef HAVE_ATTR_GET */ + +/* Define to 1 if tests/libtests has calling conventions checking for the CPU + */ +/* #undef HAVE_CALLING_CONVENTIONS */ + +/* Define to 1 if you have the `clock' function. */ +#define HAVE_CLOCK 1 + +/* Define to 1 if you have the `clock_gettime' function */ +#define HAVE_CLOCK_GETTIME 1 + +/* Define to 1 if you have the `cputime' function. */ +/* #undef HAVE_CPUTIME */ + +/* Define to 1 if you have the declaration of `fgetc', and to 0 if you don't. + */ +#define HAVE_DECL_FGETC 1 + +/* Define to 1 if you have the declaration of `fscanf', and to 0 if you don't. + */ +#define HAVE_DECL_FSCANF 1 + +/* Define to 1 if you have the declaration of `optarg', and to 0 if you don't. + */ +#define HAVE_DECL_OPTARG 1 + +/* Define to 1 if you have the declaration of `sys_errlist', and to 0 if you + don't. */ +#define HAVE_DECL_SYS_ERRLIST 0 + +/* Define to 1 if you have the declaration of `sys_nerr', and to 0 if you + don't. */ +#define HAVE_DECL_SYS_NERR 0 + +/* Define to 1 if you have the declaration of `ungetc', and to 0 if you don't. + */ +#define HAVE_DECL_UNGETC 1 + +/* Define to 1 if you have the declaration of `vfprintf', and to 0 if you + don't. */ +#define HAVE_DECL_VFPRINTF 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define one of the following to 1 for the format of a `double'. + If your format is not among these choices, or you don't know what it is, + then leave all undefined. + IEEE_LITTLE_SWAPPED means little endian, but with the two 4-byte halves + swapped, as used by ARM CPUs in little endian mode. */ +/* #undef HAVE_DOUBLE_IEEE_BIG_ENDIAN */ +#define HAVE_DOUBLE_IEEE_LITTLE_ENDIAN 1 +/* #undef HAVE_DOUBLE_IEEE_LITTLE_SWAPPED */ +/* #undef HAVE_DOUBLE_VAX_D */ +/* #undef HAVE_DOUBLE_VAX_G */ +/* #undef HAVE_DOUBLE_CRAY_CFP */ + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_FLOAT_H 1 + +/* Define to 1 if you have the `getpagesize' function. */ +#define HAVE_GETPAGESIZE 1 + +/* Define to 1 if you have the `getrusage' function. */ +#define HAVE_GETRUSAGE 1 + +/* Define to 1 if you have the `getsysinfo' function. */ +/* #undef HAVE_GETSYSINFO */ + +/* Define to 1 if you have the `gettimeofday' function. */ +#define HAVE_GETTIMEOFDAY 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((visibility)) + and __attribute__ ((alias)) */ +#define HAVE_HIDDEN_ALIAS 1 + +/* Define one of these to 1 for the host CPU family. + If your CPU is not in any of these families, leave all undefined. + For an AMD64 chip, define "x86" in ABI=32, but not in ABI=64. */ +/* #undef HAVE_HOST_CPU_FAMILY_alpha */ +/* #undef HAVE_HOST_CPU_FAMILY_m68k */ +/* #undef HAVE_HOST_CPU_FAMILY_power */ +/* #undef HAVE_HOST_CPU_FAMILY_powerpc */ +/* #undef HAVE_HOST_CPU_FAMILY_x86 */ +/* #undef HAVE_HOST_CPU_FAMILY_x86_64 */ + +/* Define one of the following to 1 for the host CPU, as per the output of + ./config.guess. If your CPU is not listed here, leave all undefined. 
*/ +/* #undef HAVE_HOST_CPU_alphaev67 */ +/* #undef HAVE_HOST_CPU_alphaev68 */ +/* #undef HAVE_HOST_CPU_alphaev7 */ +/* #undef HAVE_HOST_CPU_m68020 */ +/* #undef HAVE_HOST_CPU_m68030 */ +/* #undef HAVE_HOST_CPU_m68040 */ +/* #undef HAVE_HOST_CPU_m68060 */ +/* #undef HAVE_HOST_CPU_m68360 */ +/* #undef HAVE_HOST_CPU_powerpc604 */ +/* #undef HAVE_HOST_CPU_powerpc604e */ +/* #undef HAVE_HOST_CPU_powerpc750 */ +/* #undef HAVE_HOST_CPU_powerpc7400 */ +/* #undef HAVE_HOST_CPU_supersparc */ +/* #undef HAVE_HOST_CPU_i386 */ +/* #undef HAVE_HOST_CPU_i586 */ +/* #undef HAVE_HOST_CPU_i686 */ +/* #undef HAVE_HOST_CPU_pentium */ +/* #undef HAVE_HOST_CPU_pentiummmx */ +/* #undef HAVE_HOST_CPU_pentiumpro */ +/* #undef HAVE_HOST_CPU_pentium2 */ +/* #undef HAVE_HOST_CPU_pentium3 */ +/* #undef HAVE_HOST_CPU_pentium4 */ +/* #undef HAVE_HOST_CPU_core2 */ +/* #undef HAVE_HOST_CPU_nehalem */ +/* #undef HAVE_HOST_CPU_westmere */ +/* #undef HAVE_HOST_CPU_sandybridge */ +/* #undef HAVE_HOST_CPU_ivybridge */ +/* #undef HAVE_HOST_CPU_haswell */ +/* #undef HAVE_HOST_CPU_broadwell */ +/* #undef HAVE_HOST_CPU_skylake */ +/* #undef HAVE_HOST_CPU_silvermont */ +/* #undef HAVE_HOST_CPU_goldmont */ +/* #undef HAVE_HOST_CPU_k8 */ +/* #undef HAVE_HOST_CPU_k10 */ +/* #undef HAVE_HOST_CPU_bulldozer */ +/* #undef HAVE_HOST_CPU_piledriver */ +/* #undef HAVE_HOST_CPU_steamroller */ +/* #undef HAVE_HOST_CPU_excavator */ +/* #undef HAVE_HOST_CPU_zen */ +/* #undef HAVE_HOST_CPU_bobcat */ +/* #undef HAVE_HOST_CPU_jaguar */ +/* #undef HAVE_HOST_CPU_s390_z900 */ +/* #undef HAVE_HOST_CPU_s390_z990 */ +/* #undef HAVE_HOST_CPU_s390_z9 */ +/* #undef HAVE_HOST_CPU_s390_z10 */ +/* #undef HAVE_HOST_CPU_s390_z196 */ + +/* Define to 1 iff we have a s390 with 64-bit registers. */ +/* #undef HAVE_HOST_CPU_s390_zarch */ + +/* Define to 1 if the system has the type `intmax_t'. */ +#define HAVE_INTMAX_T 1 + +/* Define to 1 if the system has the type `intptr_t'. 
*/ +#define HAVE_INTPTR_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_INVENT_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LANGINFO_H 1 + +/* Define one of these to 1 for the endianness of `mp_limb_t'. + If the endianness is not a simple big or little, or you don't know what + it is, then leave both undefined. */ +/* #undef HAVE_LIMB_BIG_ENDIAN */ +#define HAVE_LIMB_LITTLE_ENDIAN 1 + +/* Define to 1 if you have the `localeconv' function. */ +#define HAVE_LOCALECONV 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LOCALE_H 1 + +/* Define to 1 if the system has the type `long double'. */ +#define HAVE_LONG_DOUBLE 1 + +/* Define to 1 if the system has the type `long long'. */ +#define HAVE_LONG_LONG 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MACHINE_HAL_SYSINFO_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `memset' function. */ +#define HAVE_MEMSET 1 + +/* Define to 1 if you have the `mmap' function. */ +#define HAVE_MMAP 1 + +/* Define to 1 if you have the `mprotect' function. */ +#define HAVE_MPROTECT 1 + +/* Define to 1 each of the following for which a native (ie. CPU specific) + implementation of the corresponding routine exists. 
*/ +#define HAVE_NATIVE_mpn_add_n 1 +/* #undef HAVE_NATIVE_mpn_add_n_sub_n */ +#define HAVE_NATIVE_mpn_add_nc 1 +/* #undef HAVE_NATIVE_mpn_addaddmul_1msb0 */ +#define HAVE_NATIVE_mpn_addlsh1_n 1 +#define HAVE_NATIVE_mpn_addlsh2_n 1 +/* #undef HAVE_NATIVE_mpn_addlsh_n */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addmul_1c */ +/* #undef HAVE_NATIVE_mpn_addmul_2 */ +/* #undef HAVE_NATIVE_mpn_addmul_3 */ +/* #undef HAVE_NATIVE_mpn_addmul_4 */ +/* #undef HAVE_NATIVE_mpn_addmul_5 */ +/* #undef HAVE_NATIVE_mpn_addmul_6 */ +/* #undef HAVE_NATIVE_mpn_addmul_7 */ +/* #undef HAVE_NATIVE_mpn_addmul_8 */ +/* #undef HAVE_NATIVE_mpn_addmul_2s */ +#define HAVE_NATIVE_mpn_and_n 1 +#define HAVE_NATIVE_mpn_andn_n 1 +#define HAVE_NATIVE_mpn_bdiv_dbm1c 1 +#define HAVE_NATIVE_mpn_bdiv_q_1 1 +#define HAVE_NATIVE_mpn_pi1_bdiv_q_1 1 +#define HAVE_NATIVE_mpn_cnd_add_n 1 +#define HAVE_NATIVE_mpn_cnd_sub_n 1 +#define HAVE_NATIVE_mpn_com 1 +#define HAVE_NATIVE_mpn_copyd 1 +#define HAVE_NATIVE_mpn_copyi 1 +/* #undef HAVE_NATIVE_mpn_div_qr_1n_pi1 */ +/* #undef HAVE_NATIVE_mpn_div_qr_2 */ +/* #undef HAVE_NATIVE_mpn_divexact_1 */ +/* #undef HAVE_NATIVE_mpn_divexact_by3c */ +/* #undef HAVE_NATIVE_mpn_divrem_1 */ +/* #undef HAVE_NATIVE_mpn_divrem_1c */ +/* #undef HAVE_NATIVE_mpn_divrem_2 */ +/* #undef HAVE_NATIVE_mpn_gcd_1 */ +#define HAVE_NATIVE_mpn_gcd_11 1 +#define 
HAVE_NATIVE_mpn_gcd_22 1 +#define HAVE_NATIVE_mpn_hamdist 1 +#define HAVE_NATIVE_mpn_invert_limb 1 +#define HAVE_NATIVE_mpn_ior_n 1 +#define HAVE_NATIVE_mpn_iorn_n 1 +#define HAVE_NATIVE_mpn_lshift 1 +#define HAVE_NATIVE_mpn_lshiftc 1 +/* #undef HAVE_NATIVE_mpn_lshsub_n */ +/* #undef HAVE_NATIVE_mpn_mod_1 */ +/* #undef HAVE_NATIVE_mpn_mod_1_1p */ +/* #undef HAVE_NATIVE_mpn_mod_1c */ +/* #undef HAVE_NATIVE_mpn_mod_1s_2p */ +/* #undef HAVE_NATIVE_mpn_mod_1s_4p */ +#define HAVE_NATIVE_mpn_mod_34lsub1 1 +/* #undef HAVE_NATIVE_mpn_modexact_1_odd */ +/* #undef HAVE_NATIVE_mpn_modexact_1c_odd */ +#define HAVE_NATIVE_mpn_mul_1 1 +#define HAVE_NATIVE_mpn_mul_1c 1 +/* #undef HAVE_NATIVE_mpn_mul_2 */ +/* #undef HAVE_NATIVE_mpn_mul_3 */ +/* #undef HAVE_NATIVE_mpn_mul_4 */ +/* #undef HAVE_NATIVE_mpn_mul_5 */ +/* #undef HAVE_NATIVE_mpn_mul_6 */ +/* #undef HAVE_NATIVE_mpn_mul_basecase */ +/* #undef HAVE_NATIVE_mpn_mullo_basecase */ +#define HAVE_NATIVE_mpn_nand_n 1 +#define HAVE_NATIVE_mpn_nior_n 1 +#define HAVE_NATIVE_mpn_popcount 1 +/* #undef HAVE_NATIVE_mpn_preinv_divrem_1 */ +/* #undef HAVE_NATIVE_mpn_preinv_mod_1 */ +/* #undef HAVE_NATIVE_mpn_redc_1 */ +/* #undef HAVE_NATIVE_mpn_redc_2 */ +#define HAVE_NATIVE_mpn_rsblsh1_n 1 +#define HAVE_NATIVE_mpn_rsblsh2_n 1 +/* #undef HAVE_NATIVE_mpn_rsblsh_n */ +/* #undef HAVE_NATIVE_mpn_rsblsh1_nc */ +/* #undef HAVE_NATIVE_mpn_rsblsh2_nc */ +/* #undef HAVE_NATIVE_mpn_rsblsh_nc */ +#define HAVE_NATIVE_mpn_rsh1add_n 1 +/* #undef HAVE_NATIVE_mpn_rsh1add_nc */ +#define HAVE_NATIVE_mpn_rsh1sub_n 1 +/* #undef HAVE_NATIVE_mpn_rsh1sub_nc */ +#define HAVE_NATIVE_mpn_rshift 1 +/* #undef HAVE_NATIVE_mpn_sbpi1_bdiv_r */ +/* #undef HAVE_NATIVE_mpn_sqr_basecase */ +/* #undef HAVE_NATIVE_mpn_sqr_diagonal */ +#define HAVE_NATIVE_mpn_sqr_diag_addlsh1 1 +#define HAVE_NATIVE_mpn_sub_n 1 +#define HAVE_NATIVE_mpn_sub_nc 1 +#define HAVE_NATIVE_mpn_sublsh1_n 1 +#define HAVE_NATIVE_mpn_sublsh2_n 1 +/* #undef HAVE_NATIVE_mpn_sublsh_n */ +/* #undef 
HAVE_NATIVE_mpn_sublsh1_nc */ +/* #undef HAVE_NATIVE_mpn_sublsh2_nc */ +/* #undef HAVE_NATIVE_mpn_sublsh_nc */ +/* #undef HAVE_NATIVE_mpn_sublsh1_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh2_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh1_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh2_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_submul_1c */ +/* #undef HAVE_NATIVE_mpn_tabselect */ +/* #undef HAVE_NATIVE_mpn_udiv_qrnnd */ +/* #undef HAVE_NATIVE_mpn_udiv_qrnnd_r */ +/* #undef HAVE_NATIVE_mpn_umul_ppmm */ +/* #undef HAVE_NATIVE_mpn_umul_ppmm_r */ +#define HAVE_NATIVE_mpn_xor_n 1 +#define HAVE_NATIVE_mpn_xnor_n 1 + +/* Define to 1 if you have the `nl_langinfo' function. */ +#define HAVE_NL_LANGINFO 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_NL_TYPES_H 1 + +/* Define to 1 if you have the `obstack_vprintf' function. */ +/* #define HAVE_OBSTACK_VPRINTF 1 */ + +/* Define to 1 if you have the `popen' function. */ +#define HAVE_POPEN 1 + +/* Define to 1 if you have the `processor_info' function. */ +/* #undef HAVE_PROCESSOR_INFO */ + +/* Define to 1 if `struct pst_processor' exists and contains + `psp_iticksperclktick'. */ +/* #undef HAVE_PSP_ITICKSPERCLKTICK */ + +/* Define to 1 if you have the `pstat_getprocessor' function. */ +/* #undef HAVE_PSTAT_GETPROCESSOR */ + +/* Define to 1 if the system has the type `ptrdiff_t'. */ +#define HAVE_PTRDIFF_T 1 + +/* Define to 1 if the system has the type `quad_t'. */ +#define HAVE_QUAD_T 1 + +/* Define to 1 if you have the `raise' function. */ +#define HAVE_RAISE 1 + +/* Define to 1 if you have the `read_real_time' function. */ +/* #undef HAVE_READ_REAL_TIME */ + +/* Define to 1 if you have the `sigaction' function. */ +#define HAVE_SIGACTION 1 + +/* Define to 1 if you have the `sigaltstack' function. */ +#define HAVE_SIGALTSTACK 1 + +/* Define to 1 if you have the `sigstack' function. 
*/ +#define HAVE_SIGSTACK 1 + +/* Tune directory speed_cyclecounter, undef=none, 1=32bits, 2=64bits) */ +/* #undef HAVE_SPEED_CYCLECOUNTER */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SSTREAM */ + +/* Define to 1 if the system has the type `stack_t'. */ +#define HAVE_STACK_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if the system has the type `std::locale'. */ +/* #undef HAVE_STD__LOCALE */ + +/* Define to 1 if you have the `strchr' function. */ +#define HAVE_STRCHR 1 + +/* Define to 1 if you have the `strerror' function. */ +#define HAVE_STRERROR 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strnlen' function. */ +#define HAVE_STRNLEN 1 + +/* Define to 1 if you have the `strtol' function. */ +#define HAVE_STRTOL 1 + +/* Define to 1 if you have the `strtoul' function. */ +#define HAVE_STRTOUL 1 + +/* Define to 1 if you have the `sysconf' function. */ +#define HAVE_SYSCONF 1 + +/* Define to 1 if you have the `sysctl' function. */ +/* #undef HAVE_SYSCTL */ + +/* Define to 1 if you have the `sysctlbyname' function. */ +/* #undef HAVE_SYSCTLBYNAME */ + +/* Define to 1 if you have the `syssgi' function. */ +/* #undef HAVE_SYSSGI */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_ATTRIBUTES_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_IOGRAPH_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_MMAN_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PROCESSOR_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PSTAT_H */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SYS_RESOURCE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSCTL_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSINFO_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSSGI_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSTEMCFG_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIMES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the `times' function. */ +#define HAVE_TIMES 1 + +/* Define to 1 if the system has the type `uint_least32_t'. */ +#define HAVE_UINT_LEAST32_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to 1 if you have the `vsnprintf' function and it works properly. */ +#define HAVE_VSNPRINTF 1 + +/* Define to 1 for Windos/64 */ +/* #undef HOST_DOS64 */ + +/* Assembler local label prefix */ +#define LSYM_PREFIX ".L" + +/* Define to the sub-directory where libtool stores uninstalled libraries. */ +#define LT_OBJDIR ".libs/" + +/* Define to 1 to disable the use of inline assembly */ +/* #undef NO_ASM */ + +/* Name of package */ +#define PACKAGE "gmp" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "gmp-bugs@gmplib.org, see https://gmplib.org/manual/Reporting-Bugs.html" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "GNU MP" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "GNU MP 6.2.1" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "gmp" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "http://www.gnu.org/software/gmp/" + +/* Define to the version of this package. 
*/ +#define PACKAGE_VERSION "6.2.1" + +/* Define as the return type of signal handlers (`int' or `void'). */ +#define RETSIGTYPE void + +/* The size of `mp_limb_t', as computed by sizeof. */ +#define SIZEOF_MP_LIMB_T 8 + +/* The size of `unsigned', as computed by sizeof. */ +#define SIZEOF_UNSIGNED 4 + +/* The size of `unsigned long', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_LONG 8 + +/* The size of `unsigned short', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_SHORT 2 + +/* The size of `void *', as computed by sizeof. */ +#define SIZEOF_VOID_P 8 + +/* Define to 1 if sscanf requires writable inputs */ +/* #undef SSCANF_WRITABLE_INPUT */ + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define to 1 if you can safely include both and . */ +#define TIME_WITH_SYS_TIME 1 + +/* Maximum size the tune program can test for SQR_TOOM2_THRESHOLD */ +#define TUNE_SQR_TOOM2_MAX SQR_TOOM2_MAX_GENERIC + +/* Version number of package */ +#define VERSION "6.2.1" + +/* Define to 1 to enable ASSERT checking, per --enable-assert */ +/* #undef WANT_ASSERT */ + +/* Define to 1 to enable GMP_CPU_TYPE faking cpuid, per --enable-fake-cpuid */ +/* #undef WANT_FAKE_CPUID */ + +/* Define to 1 when building a fat binary. */ +/* #undef WANT_FAT_BINARY */ + +/* Define to 1 to enable FFTs for multiplication, per --enable-fft */ +#define WANT_FFT 1 + +/* Define to 1 to enable old mpn_mul_fft_full for multiplication, per + --enable-old-fft-full */ +/* #undef WANT_OLD_FFT_FULL */ + +/* Define to 1 if --enable-profiling=gprof */ +/* #undef WANT_PROFILING_GPROF */ + +/* Define to 1 if --enable-profiling=instrument */ +/* #undef WANT_PROFILING_INSTRUMENT */ + +/* Define to 1 if --enable-profiling=prof */ +/* #undef WANT_PROFILING_PROF */ + +/* Define one of these to 1 for the desired temporary memory allocation + method, per --enable-alloca. 
*/ +#define WANT_TMP_ALLOCA 1 +/* #undef WANT_TMP_REENTRANT */ +/* #undef WANT_TMP_NOTREENTRANT */ +/* #undef WANT_TMP_DEBUG */ + +/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most + significant byte first (like Motorola and SPARC, unlike Intel). */ +#if defined AC_APPLE_UNIVERSAL_BUILD +# if defined __BIG_ENDIAN__ +# define WORDS_BIGENDIAN 1 +# endif +#else +# ifndef WORDS_BIGENDIAN +/* # undef WORDS_BIGENDIAN */ +# endif +#endif + +/* Define to 1 if the assembler understands the mulx instruction */ +/* #undef X86_ASM_MULX */ + +/* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a + `char[]'. */ +/* #undef YYTEXT_POINTER */ + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +/* #undef inline */ +#endif + +/* Define to the equivalent of the C99 'restrict' keyword, or to + nothing if this is not supported. Do not define if restrict is + supported directly. */ +#define restrict __restrict +/* Work around a bug in Sun C++: it does not support _Restrict or + __restrict__, even though the corresponding Sun C compiler ends up with + "#define restrict _Restrict" or "#define restrict __restrict__" in the + previous line. Perhaps some future version of Sun C++ will work with + restrict; if so, hopefully it defines __RESTRICT like Sun C does. */ +#if defined __SUNPRO_CC && !defined __RESTRICT +# define _Restrict +# define __restrict__ +#endif + +/* Define to empty if the keyword `volatile' does not work. Warning: valid + code using `volatile' can become incorrect without. Disable with care. */ +/* #undef volatile */ diff --git a/ext/gmp/gen/aarch64-macos/config.h b/ext/gmp/gen/aarch64-macos/config.h new file mode 100644 index 0000000000..dd1ca7f842 --- /dev/null +++ b/ext/gmp/gen/aarch64-macos/config.h @@ -0,0 +1,668 @@ +/* config.h. Generated from config.in by configure. */ +/* config.in. 
Generated from configure.ac by autoheader. */ + +/* + +Copyright 1996-2020 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. +*/ + +/* Define if building universal (internal helper macro) */ +/* #undef AC_APPLE_UNIVERSAL_BUILD */ + +/* The gmp-mparam.h file (a string) the tune program should suggest updating. + */ +#define GMP_MPARAM_H_SUGGEST "./mpn/arm64/gmp-mparam.h" + +/* Define to 1 if you have the `alarm' function. */ +#define HAVE_ALARM 1 + +/* Define to 1 if alloca() works (via gmp-impl.h). */ +#define HAVE_ALLOCA 1 + +/* Define to 1 if you have and it should be used (not on Ultrix). 
+ */ +#define HAVE_ALLOCA_H 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((const)) */ +#define HAVE_ATTRIBUTE_CONST 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((malloc)) */ +#define HAVE_ATTRIBUTE_MALLOC 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((mode (XX))) + */ +#define HAVE_ATTRIBUTE_MODE 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((noreturn)) */ +#define HAVE_ATTRIBUTE_NORETURN 1 + +/* Define to 1 if you have the `attr_get' function. */ +/* #undef HAVE_ATTR_GET */ + +/* Define to 1 if tests/libtests has calling conventions checking for the CPU + */ +/* #undef HAVE_CALLING_CONVENTIONS */ + +/* Define to 1 if you have the `clock' function. */ +#define HAVE_CLOCK 1 + +/* Define to 1 if you have the `clock_gettime' function */ +#define HAVE_CLOCK_GETTIME 1 + +/* Define to 1 if you have the `cputime' function. */ +/* #undef HAVE_CPUTIME */ + +/* Define to 1 if you have the declaration of `fgetc', and to 0 if you don't. + */ +#define HAVE_DECL_FGETC 1 + +/* Define to 1 if you have the declaration of `fscanf', and to 0 if you don't. + */ +#define HAVE_DECL_FSCANF 1 + +/* Define to 1 if you have the declaration of `optarg', and to 0 if you don't. + */ +#define HAVE_DECL_OPTARG 1 + +/* Define to 1 if you have the declaration of `sys_errlist', and to 0 if you + don't. */ +#define HAVE_DECL_SYS_ERRLIST 1 + +/* Define to 1 if you have the declaration of `sys_nerr', and to 0 if you + don't. */ +#define HAVE_DECL_SYS_NERR 1 + +/* Define to 1 if you have the declaration of `ungetc', and to 0 if you don't. + */ +#define HAVE_DECL_UNGETC 1 + +/* Define to 1 if you have the declaration of `vfprintf', and to 0 if you + don't. */ +#define HAVE_DECL_VFPRINTF 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define one of the following to 1 for the format of a `double'. 
+ If your format is not among these choices, or you don't know what it is, + then leave all undefined. + IEEE_LITTLE_SWAPPED means little endian, but with the two 4-byte halves + swapped, as used by ARM CPUs in little endian mode. */ +/* #undef HAVE_DOUBLE_IEEE_BIG_ENDIAN */ +#define HAVE_DOUBLE_IEEE_LITTLE_ENDIAN 1 +/* #undef HAVE_DOUBLE_IEEE_LITTLE_SWAPPED */ +/* #undef HAVE_DOUBLE_VAX_D */ +/* #undef HAVE_DOUBLE_VAX_G */ +/* #undef HAVE_DOUBLE_CRAY_CFP */ + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_FLOAT_H 1 + +/* Define to 1 if you have the `getpagesize' function. */ +#define HAVE_GETPAGESIZE 1 + +/* Define to 1 if you have the `getrusage' function. */ +#define HAVE_GETRUSAGE 1 + +/* Define to 1 if you have the `getsysinfo' function. */ +/* #undef HAVE_GETSYSINFO */ + +/* Define to 1 if you have the `gettimeofday' function. */ +#define HAVE_GETTIMEOFDAY 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((visibility)) + and __attribute__ ((alias)) */ +/* #undef HAVE_HIDDEN_ALIAS */ + +/* Define one of these to 1 for the host CPU family. + If your CPU is not in any of these families, leave all undefined. + For an AMD64 chip, define "x86" in ABI=32, but not in ABI=64. */ +/* #undef HAVE_HOST_CPU_FAMILY_alpha */ +/* #undef HAVE_HOST_CPU_FAMILY_m68k */ +/* #undef HAVE_HOST_CPU_FAMILY_power */ +/* #undef HAVE_HOST_CPU_FAMILY_powerpc */ +/* #undef HAVE_HOST_CPU_FAMILY_x86 */ +/* #undef HAVE_HOST_CPU_FAMILY_x86_64 */ + +/* Define one of the following to 1 for the host CPU, as per the output of + ./config.guess. If your CPU is not listed here, leave all undefined. 
*/ +/* #undef HAVE_HOST_CPU_alphaev67 */ +/* #undef HAVE_HOST_CPU_alphaev68 */ +/* #undef HAVE_HOST_CPU_alphaev7 */ +/* #undef HAVE_HOST_CPU_m68020 */ +/* #undef HAVE_HOST_CPU_m68030 */ +/* #undef HAVE_HOST_CPU_m68040 */ +/* #undef HAVE_HOST_CPU_m68060 */ +/* #undef HAVE_HOST_CPU_m68360 */ +/* #undef HAVE_HOST_CPU_powerpc604 */ +/* #undef HAVE_HOST_CPU_powerpc604e */ +/* #undef HAVE_HOST_CPU_powerpc750 */ +/* #undef HAVE_HOST_CPU_powerpc7400 */ +/* #undef HAVE_HOST_CPU_supersparc */ +/* #undef HAVE_HOST_CPU_i386 */ +/* #undef HAVE_HOST_CPU_i586 */ +/* #undef HAVE_HOST_CPU_i686 */ +/* #undef HAVE_HOST_CPU_pentium */ +/* #undef HAVE_HOST_CPU_pentiummmx */ +/* #undef HAVE_HOST_CPU_pentiumpro */ +/* #undef HAVE_HOST_CPU_pentium2 */ +/* #undef HAVE_HOST_CPU_pentium3 */ +/* #undef HAVE_HOST_CPU_pentium4 */ +/* #undef HAVE_HOST_CPU_core2 */ +/* #undef HAVE_HOST_CPU_nehalem */ +/* #undef HAVE_HOST_CPU_westmere */ +/* #undef HAVE_HOST_CPU_sandybridge */ +/* #undef HAVE_HOST_CPU_ivybridge */ +/* #undef HAVE_HOST_CPU_haswell */ +/* #undef HAVE_HOST_CPU_broadwell */ +/* #undef HAVE_HOST_CPU_skylake */ +/* #undef HAVE_HOST_CPU_silvermont */ +/* #undef HAVE_HOST_CPU_goldmont */ +/* #undef HAVE_HOST_CPU_k8 */ +/* #undef HAVE_HOST_CPU_k10 */ +/* #undef HAVE_HOST_CPU_bulldozer */ +/* #undef HAVE_HOST_CPU_piledriver */ +/* #undef HAVE_HOST_CPU_steamroller */ +/* #undef HAVE_HOST_CPU_excavator */ +/* #undef HAVE_HOST_CPU_zen */ +/* #undef HAVE_HOST_CPU_bobcat */ +/* #undef HAVE_HOST_CPU_jaguar */ +/* #undef HAVE_HOST_CPU_s390_z900 */ +/* #undef HAVE_HOST_CPU_s390_z990 */ +/* #undef HAVE_HOST_CPU_s390_z9 */ +/* #undef HAVE_HOST_CPU_s390_z10 */ +/* #undef HAVE_HOST_CPU_s390_z196 */ + +/* Define to 1 iff we have a s390 with 64-bit registers. */ +/* #undef HAVE_HOST_CPU_s390_zarch */ + +/* Define to 1 if the system has the type `intmax_t'. */ +#define HAVE_INTMAX_T 1 + +/* Define to 1 if the system has the type `intptr_t'. 
*/ +#define HAVE_INTPTR_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_INVENT_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LANGINFO_H 1 + +/* Define one of these to 1 for the endianness of `mp_limb_t'. + If the endianness is not a simple big or little, or you don't know what + it is, then leave both undefined. */ +/* #undef HAVE_LIMB_BIG_ENDIAN */ +#define HAVE_LIMB_LITTLE_ENDIAN 1 + +/* Define to 1 if you have the `localeconv' function. */ +#define HAVE_LOCALECONV 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LOCALE_H 1 + +/* Define to 1 if the system has the type `long double'. */ +#define HAVE_LONG_DOUBLE 1 + +/* Define to 1 if the system has the type `long long'. */ +#define HAVE_LONG_LONG 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MACHINE_HAL_SYSINFO_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `memset' function. */ +#define HAVE_MEMSET 1 + +/* Define to 1 if you have the `mmap' function. */ +#define HAVE_MMAP 1 + +/* Define to 1 if you have the `mprotect' function. */ +#define HAVE_MPROTECT 1 + +/* Define to 1 each of the following for which a native (ie. CPU specific) + implementation of the corresponding routine exists. 
*/ +#define HAVE_NATIVE_mpn_add_n 1 +/* #undef HAVE_NATIVE_mpn_add_n_sub_n */ +#define HAVE_NATIVE_mpn_add_nc 1 +/* #undef HAVE_NATIVE_mpn_addaddmul_1msb0 */ +#define HAVE_NATIVE_mpn_addlsh1_n 1 +#define HAVE_NATIVE_mpn_addlsh2_n 1 +/* #undef HAVE_NATIVE_mpn_addlsh_n */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addmul_1c */ +/* #undef HAVE_NATIVE_mpn_addmul_2 */ +/* #undef HAVE_NATIVE_mpn_addmul_3 */ +/* #undef HAVE_NATIVE_mpn_addmul_4 */ +/* #undef HAVE_NATIVE_mpn_addmul_5 */ +/* #undef HAVE_NATIVE_mpn_addmul_6 */ +/* #undef HAVE_NATIVE_mpn_addmul_7 */ +/* #undef HAVE_NATIVE_mpn_addmul_8 */ +/* #undef HAVE_NATIVE_mpn_addmul_2s */ +#define HAVE_NATIVE_mpn_and_n 1 +#define HAVE_NATIVE_mpn_andn_n 1 +#define HAVE_NATIVE_mpn_bdiv_dbm1c 1 +#define HAVE_NATIVE_mpn_bdiv_q_1 1 +#define HAVE_NATIVE_mpn_pi1_bdiv_q_1 1 +#define HAVE_NATIVE_mpn_cnd_add_n 1 +#define HAVE_NATIVE_mpn_cnd_sub_n 1 +#define HAVE_NATIVE_mpn_com 1 +#define HAVE_NATIVE_mpn_copyd 1 +#define HAVE_NATIVE_mpn_copyi 1 +/* #undef HAVE_NATIVE_mpn_div_qr_1n_pi1 */ +/* #undef HAVE_NATIVE_mpn_div_qr_2 */ +/* #undef HAVE_NATIVE_mpn_divexact_1 */ +/* #undef HAVE_NATIVE_mpn_divexact_by3c */ +/* #undef HAVE_NATIVE_mpn_divrem_1 */ +/* #undef HAVE_NATIVE_mpn_divrem_1c */ +/* #undef HAVE_NATIVE_mpn_divrem_2 */ +/* #undef HAVE_NATIVE_mpn_gcd_1 */ +#define HAVE_NATIVE_mpn_gcd_11 1 +#define 
HAVE_NATIVE_mpn_gcd_22 1 +#define HAVE_NATIVE_mpn_hamdist 1 +#define HAVE_NATIVE_mpn_invert_limb 1 +#define HAVE_NATIVE_mpn_ior_n 1 +#define HAVE_NATIVE_mpn_iorn_n 1 +#define HAVE_NATIVE_mpn_lshift 1 +#define HAVE_NATIVE_mpn_lshiftc 1 +/* #undef HAVE_NATIVE_mpn_lshsub_n */ +/* #undef HAVE_NATIVE_mpn_mod_1 */ +/* #undef HAVE_NATIVE_mpn_mod_1_1p */ +/* #undef HAVE_NATIVE_mpn_mod_1c */ +/* #undef HAVE_NATIVE_mpn_mod_1s_2p */ +/* #undef HAVE_NATIVE_mpn_mod_1s_4p */ +#define HAVE_NATIVE_mpn_mod_34lsub1 1 +/* #undef HAVE_NATIVE_mpn_modexact_1_odd */ +/* #undef HAVE_NATIVE_mpn_modexact_1c_odd */ +#define HAVE_NATIVE_mpn_mul_1 1 +#define HAVE_NATIVE_mpn_mul_1c 1 +/* #undef HAVE_NATIVE_mpn_mul_2 */ +/* #undef HAVE_NATIVE_mpn_mul_3 */ +/* #undef HAVE_NATIVE_mpn_mul_4 */ +/* #undef HAVE_NATIVE_mpn_mul_5 */ +/* #undef HAVE_NATIVE_mpn_mul_6 */ +/* #undef HAVE_NATIVE_mpn_mul_basecase */ +/* #undef HAVE_NATIVE_mpn_mullo_basecase */ +#define HAVE_NATIVE_mpn_nand_n 1 +#define HAVE_NATIVE_mpn_nior_n 1 +#define HAVE_NATIVE_mpn_popcount 1 +/* #undef HAVE_NATIVE_mpn_preinv_divrem_1 */ +/* #undef HAVE_NATIVE_mpn_preinv_mod_1 */ +/* #undef HAVE_NATIVE_mpn_redc_1 */ +/* #undef HAVE_NATIVE_mpn_redc_2 */ +#define HAVE_NATIVE_mpn_rsblsh1_n 1 +#define HAVE_NATIVE_mpn_rsblsh2_n 1 +/* #undef HAVE_NATIVE_mpn_rsblsh_n */ +/* #undef HAVE_NATIVE_mpn_rsblsh1_nc */ +/* #undef HAVE_NATIVE_mpn_rsblsh2_nc */ +/* #undef HAVE_NATIVE_mpn_rsblsh_nc */ +#define HAVE_NATIVE_mpn_rsh1add_n 1 +/* #undef HAVE_NATIVE_mpn_rsh1add_nc */ +#define HAVE_NATIVE_mpn_rsh1sub_n 1 +/* #undef HAVE_NATIVE_mpn_rsh1sub_nc */ +#define HAVE_NATIVE_mpn_rshift 1 +/* #undef HAVE_NATIVE_mpn_sbpi1_bdiv_r */ +/* #undef HAVE_NATIVE_mpn_sqr_basecase */ +/* #undef HAVE_NATIVE_mpn_sqr_diagonal */ +#define HAVE_NATIVE_mpn_sqr_diag_addlsh1 1 +#define HAVE_NATIVE_mpn_sub_n 1 +#define HAVE_NATIVE_mpn_sub_nc 1 +#define HAVE_NATIVE_mpn_sublsh1_n 1 +#define HAVE_NATIVE_mpn_sublsh2_n 1 +/* #undef HAVE_NATIVE_mpn_sublsh_n */ +/* #undef 
HAVE_NATIVE_mpn_sublsh1_nc */ +/* #undef HAVE_NATIVE_mpn_sublsh2_nc */ +/* #undef HAVE_NATIVE_mpn_sublsh_nc */ +/* #undef HAVE_NATIVE_mpn_sublsh1_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh2_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh1_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh2_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_submul_1c */ +/* #undef HAVE_NATIVE_mpn_tabselect */ +/* #undef HAVE_NATIVE_mpn_udiv_qrnnd */ +/* #undef HAVE_NATIVE_mpn_udiv_qrnnd_r */ +/* #undef HAVE_NATIVE_mpn_umul_ppmm */ +/* #undef HAVE_NATIVE_mpn_umul_ppmm_r */ +#define HAVE_NATIVE_mpn_xor_n 1 +#define HAVE_NATIVE_mpn_xnor_n 1 + +/* Define to 1 if you have the `nl_langinfo' function. */ +#define HAVE_NL_LANGINFO 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_NL_TYPES_H 1 + +/* Define to 1 if you have the `obstack_vprintf' function. */ +/* #undef HAVE_OBSTACK_VPRINTF */ + +/* Define to 1 if you have the `popen' function. */ +#define HAVE_POPEN 1 + +/* Define to 1 if you have the `processor_info' function. */ +#define HAVE_PROCESSOR_INFO 1 + +/* Define to 1 if `struct pst_processor' exists and contains + `psp_iticksperclktick'. */ +/* #undef HAVE_PSP_ITICKSPERCLKTICK */ + +/* Define to 1 if you have the `pstat_getprocessor' function. */ +/* #undef HAVE_PSTAT_GETPROCESSOR */ + +/* Define to 1 if the system has the type `ptrdiff_t'. */ +#define HAVE_PTRDIFF_T 1 + +/* Define to 1 if the system has the type `quad_t'. */ +#define HAVE_QUAD_T 1 + +/* Define to 1 if you have the `raise' function. */ +#define HAVE_RAISE 1 + +/* Define to 1 if you have the `read_real_time' function. */ +/* #undef HAVE_READ_REAL_TIME */ + +/* Define to 1 if you have the `sigaction' function. */ +#define HAVE_SIGACTION 1 + +/* Define to 1 if you have the `sigaltstack' function. */ +#define HAVE_SIGALTSTACK 1 + +/* Define to 1 if you have the `sigstack' function. 
*/ +/* #undef HAVE_SIGSTACK */ + +/* Tune directory speed_cyclecounter, undef=none, 1=32bits, 2=64bits) */ +/* #undef HAVE_SPEED_CYCLECOUNTER */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SSTREAM */ + +/* Define to 1 if the system has the type `stack_t'. */ +#define HAVE_STACK_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if the system has the type `std::locale'. */ +/* #undef HAVE_STD__LOCALE */ + +/* Define to 1 if you have the `strchr' function. */ +#define HAVE_STRCHR 1 + +/* Define to 1 if you have the `strerror' function. */ +#define HAVE_STRERROR 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strnlen' function. */ +#define HAVE_STRNLEN 1 + +/* Define to 1 if you have the `strtol' function. */ +#define HAVE_STRTOL 1 + +/* Define to 1 if you have the `strtoul' function. */ +#define HAVE_STRTOUL 1 + +/* Define to 1 if you have the `sysconf' function. */ +#define HAVE_SYSCONF 1 + +/* Define to 1 if you have the `sysctl' function. */ +#define HAVE_SYSCTL 1 + +/* Define to 1 if you have the `sysctlbyname' function. */ +#define HAVE_SYSCTLBYNAME 1 + +/* Define to 1 if you have the `syssgi' function. */ +/* #undef HAVE_SYSSGI */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_ATTRIBUTES_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_IOGRAPH_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_MMAN_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PROCESSOR_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PSTAT_H */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SYS_RESOURCE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSCTL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSINFO_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSSGI_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSTEMCFG_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIMES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the `times' function. */ +#define HAVE_TIMES 1 + +/* Define to 1 if the system has the type `uint_least32_t'. */ +#define HAVE_UINT_LEAST32_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to 1 if you have the `vsnprintf' function and it works properly. */ +#define HAVE_VSNPRINTF 1 + +/* Define to 1 for Windos/64 */ +/* #undef HOST_DOS64 */ + +/* Assembler local label prefix */ +#define LSYM_PREFIX "L" + +/* Define to the sub-directory where libtool stores uninstalled libraries. */ +#define LT_OBJDIR ".libs/" + +/* Define to 1 to disable the use of inline assembly */ +/* #undef NO_ASM */ + +/* Name of package */ +#define PACKAGE "gmp" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "gmp-bugs@gmplib.org, see https://gmplib.org/manual/Reporting-Bugs.html" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "GNU MP" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "GNU MP 6.2.1" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "gmp" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "http://www.gnu.org/software/gmp/" + +/* Define to the version of this package. 
*/ +#define PACKAGE_VERSION "6.2.1" + +/* Define as the return type of signal handlers (`int' or `void'). */ +#define RETSIGTYPE void + +/* The size of `mp_limb_t', as computed by sizeof. */ +#define SIZEOF_MP_LIMB_T 8 + +/* The size of `unsigned', as computed by sizeof. */ +#define SIZEOF_UNSIGNED 4 + +/* The size of `unsigned long', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_LONG 8 + +/* The size of `unsigned short', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_SHORT 2 + +/* The size of `void *', as computed by sizeof. */ +#define SIZEOF_VOID_P 8 + +/* Define to 1 if sscanf requires writable inputs */ +/* #undef SSCANF_WRITABLE_INPUT */ + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define to 1 if you can safely include both and . */ +#define TIME_WITH_SYS_TIME 1 + +/* Maximum size the tune program can test for SQR_TOOM2_THRESHOLD */ +#define TUNE_SQR_TOOM2_MAX SQR_TOOM2_MAX_GENERIC + +/* Version number of package */ +#define VERSION "6.2.1" + +/* Define to 1 to enable ASSERT checking, per --enable-assert */ +/* #undef WANT_ASSERT */ + +/* Define to 1 to enable GMP_CPU_TYPE faking cpuid, per --enable-fake-cpuid */ +/* #undef WANT_FAKE_CPUID */ + +/* Define to 1 when building a fat binary. */ +/* #undef WANT_FAT_BINARY */ + +/* Define to 1 to enable FFTs for multiplication, per --enable-fft */ +#define WANT_FFT 1 + +/* Define to 1 to enable old mpn_mul_fft_full for multiplication, per + --enable-old-fft-full */ +/* #undef WANT_OLD_FFT_FULL */ + +/* Define to 1 if --enable-profiling=gprof */ +/* #undef WANT_PROFILING_GPROF */ + +/* Define to 1 if --enable-profiling=instrument */ +/* #undef WANT_PROFILING_INSTRUMENT */ + +/* Define to 1 if --enable-profiling=prof */ +/* #undef WANT_PROFILING_PROF */ + +/* Define one of these to 1 for the desired temporary memory allocation + method, per --enable-alloca. 
*/ +#define WANT_TMP_ALLOCA 1 +/* #undef WANT_TMP_REENTRANT */ +/* #undef WANT_TMP_NOTREENTRANT */ +/* #undef WANT_TMP_DEBUG */ + +/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most + significant byte first (like Motorola and SPARC, unlike Intel). */ +#if defined AC_APPLE_UNIVERSAL_BUILD +# if defined __BIG_ENDIAN__ +# define WORDS_BIGENDIAN 1 +# endif +#else +# ifndef WORDS_BIGENDIAN +/* # undef WORDS_BIGENDIAN */ +# endif +#endif + +/* Define to 1 if the assembler understands the mulx instruction */ +/* #undef X86_ASM_MULX */ + +/* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a + `char[]'. */ +#define YYTEXT_POINTER 1 + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +/* #undef inline */ +#endif + +/* Define to the equivalent of the C99 'restrict' keyword, or to + nothing if this is not supported. Do not define if restrict is + supported directly. */ +#define restrict __restrict +/* Work around a bug in Sun C++: it does not support _Restrict or + __restrict__, even though the corresponding Sun C compiler ends up with + "#define restrict _Restrict" or "#define restrict __restrict__" in the + previous line. Perhaps some future version of Sun C++ will work with + restrict; if so, hopefully it defines __RESTRICT like Sun C does. */ +#if defined __SUNPRO_CC && !defined __RESTRICT +# define _Restrict +# define __restrict__ +#endif + +/* Define to empty if the keyword `volatile' does not work. Warning: valid + code using `volatile' can become incorrect without. Disable with care. */ +/* #undef volatile */ diff --git a/ext/gmp/gen/x86_64-linux/config.h b/ext/gmp/gen/x86_64-linux/config.h new file mode 100644 index 0000000000..47840ffb13 --- /dev/null +++ b/ext/gmp/gen/x86_64-linux/config.h @@ -0,0 +1,668 @@ +/* config.h. Generated from config.in by configure. */ +/* config.in. 
Generated from configure.ac by autoheader. */ + +/* + +Copyright 1996-2020 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. +*/ + +/* Define if building universal (internal helper macro) */ +/* #undef AC_APPLE_UNIVERSAL_BUILD */ + +/* The gmp-mparam.h file (a string) the tune program should suggest updating. + */ +#define GMP_MPARAM_H_SUGGEST "./mpn/x86_64/k8/gmp-mparam.h" + +/* Define to 1 if you have the `alarm' function. */ +#define HAVE_ALARM 1 + +/* Define to 1 if alloca() works (via gmp-impl.h). */ +#define HAVE_ALLOCA 1 + +/* Define to 1 if you have and it should be used (not on Ultrix). 
+ */ +#define HAVE_ALLOCA_H 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((const)) */ +#define HAVE_ATTRIBUTE_CONST 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((malloc)) */ +#define HAVE_ATTRIBUTE_MALLOC 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((mode (XX))) + */ +#define HAVE_ATTRIBUTE_MODE 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((noreturn)) */ +#define HAVE_ATTRIBUTE_NORETURN 1 + +/* Define to 1 if you have the `attr_get' function. */ +/* #undef HAVE_ATTR_GET */ + +/* Define to 1 if tests/libtests has calling conventions checking for the CPU + */ +#define HAVE_CALLING_CONVENTIONS 1 + +/* Define to 1 if you have the `clock' function. */ +#define HAVE_CLOCK 1 + +/* Define to 1 if you have the `clock_gettime' function */ +#define HAVE_CLOCK_GETTIME 1 + +/* Define to 1 if you have the `cputime' function. */ +/* #undef HAVE_CPUTIME */ + +/* Define to 1 if you have the declaration of `fgetc', and to 0 if you don't. + */ +#define HAVE_DECL_FGETC 1 + +/* Define to 1 if you have the declaration of `fscanf', and to 0 if you don't. + */ +#define HAVE_DECL_FSCANF 1 + +/* Define to 1 if you have the declaration of `optarg', and to 0 if you don't. + */ +#define HAVE_DECL_OPTARG 1 + +/* Define to 1 if you have the declaration of `sys_errlist', and to 0 if you + don't. */ +#define HAVE_DECL_SYS_ERRLIST 0 + +/* Define to 1 if you have the declaration of `sys_nerr', and to 0 if you + don't. */ +#define HAVE_DECL_SYS_NERR 0 + +/* Define to 1 if you have the declaration of `ungetc', and to 0 if you don't. + */ +#define HAVE_DECL_UNGETC 1 + +/* Define to 1 if you have the declaration of `vfprintf', and to 0 if you + don't. */ +#define HAVE_DECL_VFPRINTF 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define one of the following to 1 for the format of a `double'. 
+ If your format is not among these choices, or you don't know what it is, + then leave all undefined. + IEEE_LITTLE_SWAPPED means little endian, but with the two 4-byte halves + swapped, as used by ARM CPUs in little endian mode. */ +/* #undef HAVE_DOUBLE_IEEE_BIG_ENDIAN */ +#define HAVE_DOUBLE_IEEE_LITTLE_ENDIAN 1 +/* #undef HAVE_DOUBLE_IEEE_LITTLE_SWAPPED */ +/* #undef HAVE_DOUBLE_VAX_D */ +/* #undef HAVE_DOUBLE_VAX_G */ +/* #undef HAVE_DOUBLE_CRAY_CFP */ + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_FLOAT_H 1 + +/* Define to 1 if you have the `getpagesize' function. */ +#define HAVE_GETPAGESIZE 1 + +/* Define to 1 if you have the `getrusage' function. */ +#define HAVE_GETRUSAGE 1 + +/* Define to 1 if you have the `getsysinfo' function. */ +/* #undef HAVE_GETSYSINFO */ + +/* Define to 1 if you have the `gettimeofday' function. */ +#define HAVE_GETTIMEOFDAY 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((visibility)) + and __attribute__ ((alias)) */ +#define HAVE_HIDDEN_ALIAS 1 + +/* Define one of these to 1 for the host CPU family. + If your CPU is not in any of these families, leave all undefined. + For an AMD64 chip, define "x86" in ABI=32, but not in ABI=64. */ +/* #undef HAVE_HOST_CPU_FAMILY_alpha */ +/* #undef HAVE_HOST_CPU_FAMILY_m68k */ +/* #undef HAVE_HOST_CPU_FAMILY_power */ +/* #undef HAVE_HOST_CPU_FAMILY_powerpc */ +/* #undef HAVE_HOST_CPU_FAMILY_x86 */ +#define HAVE_HOST_CPU_FAMILY_x86_64 1 + +/* Define one of the following to 1 for the host CPU, as per the output of + ./config.guess. If your CPU is not listed here, leave all undefined. 
*/ +/* #undef HAVE_HOST_CPU_alphaev67 */ +/* #undef HAVE_HOST_CPU_alphaev68 */ +/* #undef HAVE_HOST_CPU_alphaev7 */ +/* #undef HAVE_HOST_CPU_m68020 */ +/* #undef HAVE_HOST_CPU_m68030 */ +/* #undef HAVE_HOST_CPU_m68040 */ +/* #undef HAVE_HOST_CPU_m68060 */ +/* #undef HAVE_HOST_CPU_m68360 */ +/* #undef HAVE_HOST_CPU_powerpc604 */ +/* #undef HAVE_HOST_CPU_powerpc604e */ +/* #undef HAVE_HOST_CPU_powerpc750 */ +/* #undef HAVE_HOST_CPU_powerpc7400 */ +/* #undef HAVE_HOST_CPU_supersparc */ +/* #undef HAVE_HOST_CPU_i386 */ +/* #undef HAVE_HOST_CPU_i586 */ +/* #undef HAVE_HOST_CPU_i686 */ +/* #undef HAVE_HOST_CPU_pentium */ +/* #undef HAVE_HOST_CPU_pentiummmx */ +/* #undef HAVE_HOST_CPU_pentiumpro */ +/* #undef HAVE_HOST_CPU_pentium2 */ +/* #undef HAVE_HOST_CPU_pentium3 */ +/* #undef HAVE_HOST_CPU_pentium4 */ +/* #undef HAVE_HOST_CPU_core2 */ +/* #undef HAVE_HOST_CPU_nehalem */ +/* #undef HAVE_HOST_CPU_westmere */ +/* #undef HAVE_HOST_CPU_sandybridge */ +/* #undef HAVE_HOST_CPU_ivybridge */ +/* #undef HAVE_HOST_CPU_haswell */ +/* #undef HAVE_HOST_CPU_broadwell */ +/* #undef HAVE_HOST_CPU_skylake */ +/* #undef HAVE_HOST_CPU_silvermont */ +/* #undef HAVE_HOST_CPU_goldmont */ +/* #undef HAVE_HOST_CPU_k8 */ +/* #undef HAVE_HOST_CPU_k10 */ +/* #undef HAVE_HOST_CPU_bulldozer */ +/* #undef HAVE_HOST_CPU_piledriver */ +/* #undef HAVE_HOST_CPU_steamroller */ +/* #undef HAVE_HOST_CPU_excavator */ +/* #undef HAVE_HOST_CPU_zen */ +/* #undef HAVE_HOST_CPU_bobcat */ +/* #undef HAVE_HOST_CPU_jaguar */ +/* #undef HAVE_HOST_CPU_s390_z900 */ +/* #undef HAVE_HOST_CPU_s390_z990 */ +/* #undef HAVE_HOST_CPU_s390_z9 */ +/* #undef HAVE_HOST_CPU_s390_z10 */ +/* #undef HAVE_HOST_CPU_s390_z196 */ + +/* Define to 1 iff we have a s390 with 64-bit registers. */ +/* #undef HAVE_HOST_CPU_s390_zarch */ + +/* Define to 1 if the system has the type `intmax_t'. */ +#define HAVE_INTMAX_T 1 + +/* Define to 1 if the system has the type `intptr_t'. 
*/ +#define HAVE_INTPTR_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_INVENT_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LANGINFO_H 1 + +/* Define one of these to 1 for the endianness of `mp_limb_t'. + If the endianness is not a simple big or little, or you don't know what + it is, then leave both undefined. */ +/* #undef HAVE_LIMB_BIG_ENDIAN */ +#define HAVE_LIMB_LITTLE_ENDIAN 1 + +/* Define to 1 if you have the `localeconv' function. */ +#define HAVE_LOCALECONV 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LOCALE_H 1 + +/* Define to 1 if the system has the type `long double'. */ +#define HAVE_LONG_DOUBLE 1 + +/* Define to 1 if the system has the type `long long'. */ +#define HAVE_LONG_LONG 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MACHINE_HAL_SYSINFO_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `memset' function. */ +#define HAVE_MEMSET 1 + +/* Define to 1 if you have the `mmap' function. */ +#define HAVE_MMAP 1 + +/* Define to 1 if you have the `mprotect' function. */ +#define HAVE_MPROTECT 1 + +/* Define to 1 each of the following for which a native (ie. CPU specific) + implementation of the corresponding routine exists. 
*/ +#define HAVE_NATIVE_mpn_add_n 1 +/* #undef HAVE_NATIVE_mpn_add_n_sub_n */ +#define HAVE_NATIVE_mpn_add_nc 1 +#define HAVE_NATIVE_mpn_addaddmul_1msb0 1 +#define HAVE_NATIVE_mpn_addlsh1_n 1 +#define HAVE_NATIVE_mpn_addlsh2_n 1 +#define HAVE_NATIVE_mpn_addlsh_n 1 +/* #undef HAVE_NATIVE_mpn_addlsh1_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addmul_1c */ +#define HAVE_NATIVE_mpn_addmul_2 1 +/* #undef HAVE_NATIVE_mpn_addmul_3 */ +/* #undef HAVE_NATIVE_mpn_addmul_4 */ +/* #undef HAVE_NATIVE_mpn_addmul_5 */ +/* #undef HAVE_NATIVE_mpn_addmul_6 */ +/* #undef HAVE_NATIVE_mpn_addmul_7 */ +/* #undef HAVE_NATIVE_mpn_addmul_8 */ +/* #undef HAVE_NATIVE_mpn_addmul_2s */ +#define HAVE_NATIVE_mpn_and_n 1 +#define HAVE_NATIVE_mpn_andn_n 1 +#define HAVE_NATIVE_mpn_bdiv_dbm1c 1 +#define HAVE_NATIVE_mpn_bdiv_q_1 1 +#define HAVE_NATIVE_mpn_pi1_bdiv_q_1 1 +#define HAVE_NATIVE_mpn_cnd_add_n 1 +#define HAVE_NATIVE_mpn_cnd_sub_n 1 +#define HAVE_NATIVE_mpn_com 1 +#define HAVE_NATIVE_mpn_copyd 1 +#define HAVE_NATIVE_mpn_copyi 1 +#define HAVE_NATIVE_mpn_div_qr_1n_pi1 1 +/* #undef HAVE_NATIVE_mpn_div_qr_2 */ +#define HAVE_NATIVE_mpn_divexact_1 1 +/* #undef HAVE_NATIVE_mpn_divexact_by3c */ +#define HAVE_NATIVE_mpn_divrem_1 1 +/* #undef HAVE_NATIVE_mpn_divrem_1c */ +#define HAVE_NATIVE_mpn_divrem_2 1 +/* #undef HAVE_NATIVE_mpn_gcd_1 */ +#define HAVE_NATIVE_mpn_gcd_11 1 +#define HAVE_NATIVE_mpn_gcd_22 1 
+#define HAVE_NATIVE_mpn_hamdist 1 +#define HAVE_NATIVE_mpn_invert_limb 1 +#define HAVE_NATIVE_mpn_ior_n 1 +#define HAVE_NATIVE_mpn_iorn_n 1 +#define HAVE_NATIVE_mpn_lshift 1 +#define HAVE_NATIVE_mpn_lshiftc 1 +/* #undef HAVE_NATIVE_mpn_lshsub_n */ +/* #undef HAVE_NATIVE_mpn_mod_1 */ +#define HAVE_NATIVE_mpn_mod_1_1p 1 +/* #undef HAVE_NATIVE_mpn_mod_1c */ +#define HAVE_NATIVE_mpn_mod_1s_2p 1 +#define HAVE_NATIVE_mpn_mod_1s_4p 1 +#define HAVE_NATIVE_mpn_mod_34lsub1 1 +#define HAVE_NATIVE_mpn_modexact_1_odd 1 +#define HAVE_NATIVE_mpn_modexact_1c_odd 1 +#define HAVE_NATIVE_mpn_mul_1 1 +#define HAVE_NATIVE_mpn_mul_1c 1 +#define HAVE_NATIVE_mpn_mul_2 1 +/* #undef HAVE_NATIVE_mpn_mul_3 */ +/* #undef HAVE_NATIVE_mpn_mul_4 */ +/* #undef HAVE_NATIVE_mpn_mul_5 */ +/* #undef HAVE_NATIVE_mpn_mul_6 */ +#define HAVE_NATIVE_mpn_mul_basecase 1 +#define HAVE_NATIVE_mpn_mullo_basecase 1 +#define HAVE_NATIVE_mpn_nand_n 1 +#define HAVE_NATIVE_mpn_nior_n 1 +#define HAVE_NATIVE_mpn_popcount 1 +#define HAVE_NATIVE_mpn_preinv_divrem_1 1 +/* #undef HAVE_NATIVE_mpn_preinv_mod_1 */ +#define HAVE_NATIVE_mpn_redc_1 1 +/* #undef HAVE_NATIVE_mpn_redc_2 */ +#define HAVE_NATIVE_mpn_rsblsh1_n 1 +#define HAVE_NATIVE_mpn_rsblsh2_n 1 +#define HAVE_NATIVE_mpn_rsblsh_n 1 +/* #undef HAVE_NATIVE_mpn_rsblsh1_nc */ +/* #undef HAVE_NATIVE_mpn_rsblsh2_nc */ +/* #undef HAVE_NATIVE_mpn_rsblsh_nc */ +#define HAVE_NATIVE_mpn_rsh1add_n 1 +#define HAVE_NATIVE_mpn_rsh1add_nc 1 +#define HAVE_NATIVE_mpn_rsh1sub_n 1 +#define HAVE_NATIVE_mpn_rsh1sub_nc 1 +#define HAVE_NATIVE_mpn_rshift 1 +/* #undef HAVE_NATIVE_mpn_sbpi1_bdiv_r */ +#define HAVE_NATIVE_mpn_sqr_basecase 1 +/* #undef HAVE_NATIVE_mpn_sqr_diagonal */ +#define HAVE_NATIVE_mpn_sqr_diag_addlsh1 1 +#define HAVE_NATIVE_mpn_sub_n 1 +#define HAVE_NATIVE_mpn_sub_nc 1 +#define HAVE_NATIVE_mpn_sublsh1_n 1 +/* #undef HAVE_NATIVE_mpn_sublsh2_n */ +/* #undef HAVE_NATIVE_mpn_sublsh_n */ +/* #undef HAVE_NATIVE_mpn_sublsh1_nc */ +/* #undef HAVE_NATIVE_mpn_sublsh2_nc */ +/* 
#undef HAVE_NATIVE_mpn_sublsh_nc */ +/* #undef HAVE_NATIVE_mpn_sublsh1_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh2_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh1_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh2_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_submul_1c */ +/* #undef HAVE_NATIVE_mpn_tabselect */ +/* #undef HAVE_NATIVE_mpn_udiv_qrnnd */ +/* #undef HAVE_NATIVE_mpn_udiv_qrnnd_r */ +/* #undef HAVE_NATIVE_mpn_umul_ppmm */ +/* #undef HAVE_NATIVE_mpn_umul_ppmm_r */ +#define HAVE_NATIVE_mpn_xor_n 1 +#define HAVE_NATIVE_mpn_xnor_n 1 + +/* Define to 1 if you have the `nl_langinfo' function. */ +#define HAVE_NL_LANGINFO 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_NL_TYPES_H 1 + +/* Define to 1 if you have the `obstack_vprintf' function. */ +/* #define HAVE_OBSTACK_VPRINTF 1 */ + +/* Define to 1 if you have the `popen' function. */ +#define HAVE_POPEN 1 + +/* Define to 1 if you have the `processor_info' function. */ +/* #undef HAVE_PROCESSOR_INFO */ + +/* Define to 1 if `struct pst_processor' exists and contains + `psp_iticksperclktick'. */ +/* #undef HAVE_PSP_ITICKSPERCLKTICK */ + +/* Define to 1 if you have the `pstat_getprocessor' function. */ +/* #undef HAVE_PSTAT_GETPROCESSOR */ + +/* Define to 1 if the system has the type `ptrdiff_t'. */ +#define HAVE_PTRDIFF_T 1 + +/* Define to 1 if the system has the type `quad_t'. */ +#define HAVE_QUAD_T 1 + +/* Define to 1 if you have the `raise' function. */ +#define HAVE_RAISE 1 + +/* Define to 1 if you have the `read_real_time' function. */ +/* #undef HAVE_READ_REAL_TIME */ + +/* Define to 1 if you have the `sigaction' function. */ +#define HAVE_SIGACTION 1 + +/* Define to 1 if you have the `sigaltstack' function. */ +#define HAVE_SIGALTSTACK 1 + +/* Define to 1 if you have the `sigstack' function. 
*/ +#define HAVE_SIGSTACK 1 + +/* Tune directory speed_cyclecounter, undef=none, 1=32bits, 2=64bits) */ +#define HAVE_SPEED_CYCLECOUNTER 2 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SSTREAM */ + +/* Define to 1 if the system has the type `stack_t'. */ +#define HAVE_STACK_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if the system has the type `std::locale'. */ +/* #undef HAVE_STD__LOCALE */ + +/* Define to 1 if you have the `strchr' function. */ +#define HAVE_STRCHR 1 + +/* Define to 1 if you have the `strerror' function. */ +#define HAVE_STRERROR 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strnlen' function. */ +#define HAVE_STRNLEN 1 + +/* Define to 1 if you have the `strtol' function. */ +#define HAVE_STRTOL 1 + +/* Define to 1 if you have the `strtoul' function. */ +#define HAVE_STRTOUL 1 + +/* Define to 1 if you have the `sysconf' function. */ +#define HAVE_SYSCONF 1 + +/* Define to 1 if you have the `sysctl' function. */ +/* #undef HAVE_SYSCTL */ + +/* Define to 1 if you have the `sysctlbyname' function. */ +/* #undef HAVE_SYSCTLBYNAME */ + +/* Define to 1 if you have the `syssgi' function. */ +/* #undef HAVE_SYSSGI */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_ATTRIBUTES_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_IOGRAPH_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_MMAN_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PROCESSOR_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PSTAT_H */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SYS_RESOURCE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSCTL_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSINFO_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSSGI_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSTEMCFG_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIMES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the `times' function. */ +#define HAVE_TIMES 1 + +/* Define to 1 if the system has the type `uint_least32_t'. */ +#define HAVE_UINT_LEAST32_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to 1 if you have the `vsnprintf' function and it works properly. */ +#define HAVE_VSNPRINTF 1 + +/* Define to 1 for Windos/64 */ +/* #undef HOST_DOS64 */ + +/* Assembler local label prefix */ +#define LSYM_PREFIX ".L" + +/* Define to the sub-directory where libtool stores uninstalled libraries. */ +#define LT_OBJDIR ".libs/" + +/* Define to 1 to disable the use of inline assembly */ +/* #undef NO_ASM */ + +/* Name of package */ +#define PACKAGE "gmp" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "gmp-bugs@gmplib.org, see https://gmplib.org/manual/Reporting-Bugs.html" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "GNU MP" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "GNU MP 6.2.1" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "gmp" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "http://www.gnu.org/software/gmp/" + +/* Define to the version of this package. 
*/ +#define PACKAGE_VERSION "6.2.1" + +/* Define as the return type of signal handlers (`int' or `void'). */ +#define RETSIGTYPE void + +/* The size of `mp_limb_t', as computed by sizeof. */ +#define SIZEOF_MP_LIMB_T 8 + +/* The size of `unsigned', as computed by sizeof. */ +#define SIZEOF_UNSIGNED 4 + +/* The size of `unsigned long', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_LONG 8 + +/* The size of `unsigned short', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_SHORT 2 + +/* The size of `void *', as computed by sizeof. */ +#define SIZEOF_VOID_P 8 + +/* Define to 1 if sscanf requires writable inputs */ +/* #undef SSCANF_WRITABLE_INPUT */ + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define to 1 if you can safely include both and . */ +#define TIME_WITH_SYS_TIME 1 + +/* Maximum size the tune program can test for SQR_TOOM2_THRESHOLD */ +/* #undef TUNE_SQR_TOOM2_MAX */ + +/* Version number of package */ +#define VERSION "6.2.1" + +/* Define to 1 to enable ASSERT checking, per --enable-assert */ +/* #undef WANT_ASSERT */ + +/* Define to 1 to enable GMP_CPU_TYPE faking cpuid, per --enable-fake-cpuid */ +/* #undef WANT_FAKE_CPUID */ + +/* Define to 1 when building a fat binary. */ +/* #undef WANT_FAT_BINARY */ + +/* Define to 1 to enable FFTs for multiplication, per --enable-fft */ +#define WANT_FFT 1 + +/* Define to 1 to enable old mpn_mul_fft_full for multiplication, per + --enable-old-fft-full */ +/* #undef WANT_OLD_FFT_FULL */ + +/* Define to 1 if --enable-profiling=gprof */ +/* #undef WANT_PROFILING_GPROF */ + +/* Define to 1 if --enable-profiling=instrument */ +/* #undef WANT_PROFILING_INSTRUMENT */ + +/* Define to 1 if --enable-profiling=prof */ +/* #undef WANT_PROFILING_PROF */ + +/* Define one of these to 1 for the desired temporary memory allocation + method, per --enable-alloca. 
*/ +#define WANT_TMP_ALLOCA 1 +/* #undef WANT_TMP_REENTRANT */ +/* #undef WANT_TMP_NOTREENTRANT */ +/* #undef WANT_TMP_DEBUG */ + +/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most + significant byte first (like Motorola and SPARC, unlike Intel). */ +#if defined AC_APPLE_UNIVERSAL_BUILD +# if defined __BIG_ENDIAN__ +# define WORDS_BIGENDIAN 1 +# endif +#else +# ifndef WORDS_BIGENDIAN +/* # undef WORDS_BIGENDIAN */ +# endif +#endif + +/* Define to 1 if the assembler understands the mulx instruction */ +/* #undef X86_ASM_MULX */ + +/* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a + `char[]'. */ +/* #undef YYTEXT_POINTER */ + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +/* #undef inline */ +#endif + +/* Define to the equivalent of the C99 'restrict' keyword, or to + nothing if this is not supported. Do not define if restrict is + supported directly. */ +#define restrict __restrict +/* Work around a bug in Sun C++: it does not support _Restrict or + __restrict__, even though the corresponding Sun C compiler ends up with + "#define restrict _Restrict" or "#define restrict __restrict__" in the + previous line. Perhaps some future version of Sun C++ will work with + restrict; if so, hopefully it defines __RESTRICT like Sun C does. */ +#if defined __SUNPRO_CC && !defined __RESTRICT +# define _Restrict +# define __restrict__ +#endif + +/* Define to empty if the keyword `volatile' does not work. Warning: valid + code using `volatile' can become incorrect without. Disable with care. */ +/* #undef volatile */ diff --git a/ext/gmp/gen/x86_64-macos/config.h b/ext/gmp/gen/x86_64-macos/config.h new file mode 100644 index 0000000000..1fbed06fdb --- /dev/null +++ b/ext/gmp/gen/x86_64-macos/config.h @@ -0,0 +1,668 @@ +/* config.h. Generated from config.in by configure. */ +/* config.in. 
Generated from configure.ac by autoheader. */ + +/* + +Copyright 1996-2020 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. +*/ + +/* Define if building universal (internal helper macro) */ +/* #undef AC_APPLE_UNIVERSAL_BUILD */ + +/* The gmp-mparam.h file (a string) the tune program should suggest updating. + */ +#define GMP_MPARAM_H_SUGGEST "./mpn/x86_64/skylake/gmp-mparam.h" + +/* Define to 1 if you have the `alarm' function. */ +#define HAVE_ALARM 1 + +/* Define to 1 if alloca() works (via gmp-impl.h). */ +#define HAVE_ALLOCA 1 + +/* Define to 1 if you have and it should be used (not on Ultrix). 
+ */ +#define HAVE_ALLOCA_H 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((const)) */ +#define HAVE_ATTRIBUTE_CONST 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((malloc)) */ +#define HAVE_ATTRIBUTE_MALLOC 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((mode (XX))) + */ +#define HAVE_ATTRIBUTE_MODE 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((noreturn)) */ +#define HAVE_ATTRIBUTE_NORETURN 1 + +/* Define to 1 if you have the `attr_get' function. */ +/* #undef HAVE_ATTR_GET */ + +/* Define to 1 if tests/libtests has calling conventions checking for the CPU + */ +#define HAVE_CALLING_CONVENTIONS 1 + +/* Define to 1 if you have the `clock' function. */ +#define HAVE_CLOCK 1 + +/* Define to 1 if you have the `clock_gettime' function */ +#define HAVE_CLOCK_GETTIME 1 + +/* Define to 1 if you have the `cputime' function. */ +/* #undef HAVE_CPUTIME */ + +/* Define to 1 if you have the declaration of `fgetc', and to 0 if you don't. + */ +#define HAVE_DECL_FGETC 1 + +/* Define to 1 if you have the declaration of `fscanf', and to 0 if you don't. + */ +#define HAVE_DECL_FSCANF 1 + +/* Define to 1 if you have the declaration of `optarg', and to 0 if you don't. + */ +#define HAVE_DECL_OPTARG 1 + +/* Define to 1 if you have the declaration of `sys_errlist', and to 0 if you + don't. */ +#define HAVE_DECL_SYS_ERRLIST 1 + +/* Define to 1 if you have the declaration of `sys_nerr', and to 0 if you + don't. */ +#define HAVE_DECL_SYS_NERR 1 + +/* Define to 1 if you have the declaration of `ungetc', and to 0 if you don't. + */ +#define HAVE_DECL_UNGETC 1 + +/* Define to 1 if you have the declaration of `vfprintf', and to 0 if you + don't. */ +#define HAVE_DECL_VFPRINTF 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define one of the following to 1 for the format of a `double'. 
+ If your format is not among these choices, or you don't know what it is, + then leave all undefined. + IEEE_LITTLE_SWAPPED means little endian, but with the two 4-byte halves + swapped, as used by ARM CPUs in little endian mode. */ +/* #undef HAVE_DOUBLE_IEEE_BIG_ENDIAN */ +#define HAVE_DOUBLE_IEEE_LITTLE_ENDIAN 1 +/* #undef HAVE_DOUBLE_IEEE_LITTLE_SWAPPED */ +/* #undef HAVE_DOUBLE_VAX_D */ +/* #undef HAVE_DOUBLE_VAX_G */ +/* #undef HAVE_DOUBLE_CRAY_CFP */ + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_FLOAT_H 1 + +/* Define to 1 if you have the `getpagesize' function. */ +#define HAVE_GETPAGESIZE 1 + +/* Define to 1 if you have the `getrusage' function. */ +#define HAVE_GETRUSAGE 1 + +/* Define to 1 if you have the `getsysinfo' function. */ +/* #undef HAVE_GETSYSINFO */ + +/* Define to 1 if you have the `gettimeofday' function. */ +#define HAVE_GETTIMEOFDAY 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((visibility)) + and __attribute__ ((alias)) */ +/* #undef HAVE_HIDDEN_ALIAS */ + +/* Define one of these to 1 for the host CPU family. + If your CPU is not in any of these families, leave all undefined. + For an AMD64 chip, define "x86" in ABI=32, but not in ABI=64. */ +/* #undef HAVE_HOST_CPU_FAMILY_alpha */ +/* #undef HAVE_HOST_CPU_FAMILY_m68k */ +/* #undef HAVE_HOST_CPU_FAMILY_power */ +/* #undef HAVE_HOST_CPU_FAMILY_powerpc */ +/* #undef HAVE_HOST_CPU_FAMILY_x86 */ +#define HAVE_HOST_CPU_FAMILY_x86_64 1 + +/* Define one of the following to 1 for the host CPU, as per the output of + ./config.guess. If your CPU is not listed here, leave all undefined. 
*/ +/* #undef HAVE_HOST_CPU_alphaev67 */ +/* #undef HAVE_HOST_CPU_alphaev68 */ +/* #undef HAVE_HOST_CPU_alphaev7 */ +/* #undef HAVE_HOST_CPU_m68020 */ +/* #undef HAVE_HOST_CPU_m68030 */ +/* #undef HAVE_HOST_CPU_m68040 */ +/* #undef HAVE_HOST_CPU_m68060 */ +/* #undef HAVE_HOST_CPU_m68360 */ +/* #undef HAVE_HOST_CPU_powerpc604 */ +/* #undef HAVE_HOST_CPU_powerpc604e */ +/* #undef HAVE_HOST_CPU_powerpc750 */ +/* #undef HAVE_HOST_CPU_powerpc7400 */ +/* #undef HAVE_HOST_CPU_supersparc */ +/* #undef HAVE_HOST_CPU_i386 */ +/* #undef HAVE_HOST_CPU_i586 */ +/* #undef HAVE_HOST_CPU_i686 */ +/* #undef HAVE_HOST_CPU_pentium */ +/* #undef HAVE_HOST_CPU_pentiummmx */ +/* #undef HAVE_HOST_CPU_pentiumpro */ +/* #undef HAVE_HOST_CPU_pentium2 */ +/* #undef HAVE_HOST_CPU_pentium3 */ +/* #undef HAVE_HOST_CPU_pentium4 */ +/* #undef HAVE_HOST_CPU_core2 */ +/* #undef HAVE_HOST_CPU_nehalem */ +/* #undef HAVE_HOST_CPU_westmere */ +/* #undef HAVE_HOST_CPU_sandybridge */ +/* #undef HAVE_HOST_CPU_ivybridge */ +/* #undef HAVE_HOST_CPU_haswell */ +/* #undef HAVE_HOST_CPU_broadwell */ +/* #undef HAVE_HOST_CPU_skylake */ +/* #undef HAVE_HOST_CPU_silvermont */ +/* #undef HAVE_HOST_CPU_goldmont */ +/* #undef HAVE_HOST_CPU_k8 */ +/* #undef HAVE_HOST_CPU_k10 */ +/* #undef HAVE_HOST_CPU_bulldozer */ +/* #undef HAVE_HOST_CPU_piledriver */ +/* #undef HAVE_HOST_CPU_steamroller */ +/* #undef HAVE_HOST_CPU_excavator */ +/* #undef HAVE_HOST_CPU_zen */ +/* #undef HAVE_HOST_CPU_bobcat */ +/* #undef HAVE_HOST_CPU_jaguar */ +/* #undef HAVE_HOST_CPU_s390_z900 */ +/* #undef HAVE_HOST_CPU_s390_z990 */ +/* #undef HAVE_HOST_CPU_s390_z9 */ +/* #undef HAVE_HOST_CPU_s390_z10 */ +/* #undef HAVE_HOST_CPU_s390_z196 */ + +/* Define to 1 iff we have a s390 with 64-bit registers. */ +/* #undef HAVE_HOST_CPU_s390_zarch */ + +/* Define to 1 if the system has the type `intmax_t'. */ +#define HAVE_INTMAX_T 1 + +/* Define to 1 if the system has the type `intptr_t'. 
*/ +#define HAVE_INTPTR_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_INVENT_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LANGINFO_H 1 + +/* Define one of these to 1 for the endianness of `mp_limb_t'. + If the endianness is not a simple big or little, or you don't know what + it is, then leave both undefined. */ +/* #undef HAVE_LIMB_BIG_ENDIAN */ +#define HAVE_LIMB_LITTLE_ENDIAN 1 + +/* Define to 1 if you have the `localeconv' function. */ +#define HAVE_LOCALECONV 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LOCALE_H 1 + +/* Define to 1 if the system has the type `long double'. */ +#define HAVE_LONG_DOUBLE 1 + +/* Define to 1 if the system has the type `long long'. */ +#define HAVE_LONG_LONG 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MACHINE_HAL_SYSINFO_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `memset' function. */ +#define HAVE_MEMSET 1 + +/* Define to 1 if you have the `mmap' function. */ +#define HAVE_MMAP 1 + +/* Define to 1 if you have the `mprotect' function. */ +#define HAVE_MPROTECT 1 + +/* Define to 1 each of the following for which a native (ie. CPU specific) + implementation of the corresponding routine exists. 
*/ +#define HAVE_NATIVE_mpn_add_n 1 +/* #undef HAVE_NATIVE_mpn_add_n_sub_n */ +#define HAVE_NATIVE_mpn_add_nc 1 +#define HAVE_NATIVE_mpn_addaddmul_1msb0 1 +#define HAVE_NATIVE_mpn_addlsh1_n 1 +#define HAVE_NATIVE_mpn_addlsh2_n 1 +#define HAVE_NATIVE_mpn_addlsh_n 1 +#define HAVE_NATIVE_mpn_addlsh1_nc 1 +#define HAVE_NATIVE_mpn_addlsh2_nc 1 +/* #undef HAVE_NATIVE_mpn_addlsh_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addmul_1c */ +#define HAVE_NATIVE_mpn_addmul_2 1 +/* #undef HAVE_NATIVE_mpn_addmul_3 */ +/* #undef HAVE_NATIVE_mpn_addmul_4 */ +/* #undef HAVE_NATIVE_mpn_addmul_5 */ +/* #undef HAVE_NATIVE_mpn_addmul_6 */ +/* #undef HAVE_NATIVE_mpn_addmul_7 */ +/* #undef HAVE_NATIVE_mpn_addmul_8 */ +/* #undef HAVE_NATIVE_mpn_addmul_2s */ +#define HAVE_NATIVE_mpn_and_n 1 +#define HAVE_NATIVE_mpn_andn_n 1 +#define HAVE_NATIVE_mpn_bdiv_dbm1c 1 +#define HAVE_NATIVE_mpn_bdiv_q_1 1 +#define HAVE_NATIVE_mpn_pi1_bdiv_q_1 1 +#define HAVE_NATIVE_mpn_cnd_add_n 1 +#define HAVE_NATIVE_mpn_cnd_sub_n 1 +#define HAVE_NATIVE_mpn_com 1 +#define HAVE_NATIVE_mpn_copyd 1 +#define HAVE_NATIVE_mpn_copyi 1 +#define HAVE_NATIVE_mpn_div_qr_1n_pi1 1 +/* #undef HAVE_NATIVE_mpn_div_qr_2 */ +#define HAVE_NATIVE_mpn_divexact_1 1 +/* #undef HAVE_NATIVE_mpn_divexact_by3c */ +#define HAVE_NATIVE_mpn_divrem_1 1 +/* #undef HAVE_NATIVE_mpn_divrem_1c */ +#define HAVE_NATIVE_mpn_divrem_2 1 +/* #undef HAVE_NATIVE_mpn_gcd_1 */ +#define HAVE_NATIVE_mpn_gcd_11 1 +#define HAVE_NATIVE_mpn_gcd_22 1 +#define 
HAVE_NATIVE_mpn_hamdist 1 +#define HAVE_NATIVE_mpn_invert_limb 1 +#define HAVE_NATIVE_mpn_ior_n 1 +#define HAVE_NATIVE_mpn_iorn_n 1 +#define HAVE_NATIVE_mpn_lshift 1 +#define HAVE_NATIVE_mpn_lshiftc 1 +/* #undef HAVE_NATIVE_mpn_lshsub_n */ +/* #undef HAVE_NATIVE_mpn_mod_1 */ +#define HAVE_NATIVE_mpn_mod_1_1p 1 +/* #undef HAVE_NATIVE_mpn_mod_1c */ +#define HAVE_NATIVE_mpn_mod_1s_2p 1 +#define HAVE_NATIVE_mpn_mod_1s_4p 1 +#define HAVE_NATIVE_mpn_mod_34lsub1 1 +#define HAVE_NATIVE_mpn_modexact_1_odd 1 +#define HAVE_NATIVE_mpn_modexact_1c_odd 1 +#define HAVE_NATIVE_mpn_mul_1 1 +/* #undef HAVE_NATIVE_mpn_mul_1c */ +#define HAVE_NATIVE_mpn_mul_2 1 +/* #undef HAVE_NATIVE_mpn_mul_3 */ +/* #undef HAVE_NATIVE_mpn_mul_4 */ +/* #undef HAVE_NATIVE_mpn_mul_5 */ +/* #undef HAVE_NATIVE_mpn_mul_6 */ +#define HAVE_NATIVE_mpn_mul_basecase 1 +#define HAVE_NATIVE_mpn_mullo_basecase 1 +#define HAVE_NATIVE_mpn_nand_n 1 +#define HAVE_NATIVE_mpn_nior_n 1 +#define HAVE_NATIVE_mpn_popcount 1 +#define HAVE_NATIVE_mpn_preinv_divrem_1 1 +/* #undef HAVE_NATIVE_mpn_preinv_mod_1 */ +#define HAVE_NATIVE_mpn_redc_1 1 +/* #undef HAVE_NATIVE_mpn_redc_2 */ +#define HAVE_NATIVE_mpn_rsblsh1_n 1 +#define HAVE_NATIVE_mpn_rsblsh2_n 1 +#define HAVE_NATIVE_mpn_rsblsh_n 1 +#define HAVE_NATIVE_mpn_rsblsh1_nc 1 +/* #undef HAVE_NATIVE_mpn_rsblsh2_nc */ +/* #undef HAVE_NATIVE_mpn_rsblsh_nc */ +#define HAVE_NATIVE_mpn_rsh1add_n 1 +#define HAVE_NATIVE_mpn_rsh1add_nc 1 +#define HAVE_NATIVE_mpn_rsh1sub_n 1 +#define HAVE_NATIVE_mpn_rsh1sub_nc 1 +#define HAVE_NATIVE_mpn_rshift 1 +/* #undef HAVE_NATIVE_mpn_sbpi1_bdiv_r */ +#define HAVE_NATIVE_mpn_sqr_basecase 1 +/* #undef HAVE_NATIVE_mpn_sqr_diagonal */ +#define HAVE_NATIVE_mpn_sqr_diag_addlsh1 1 +#define HAVE_NATIVE_mpn_sub_n 1 +#define HAVE_NATIVE_mpn_sub_nc 1 +#define HAVE_NATIVE_mpn_sublsh1_n 1 +#define HAVE_NATIVE_mpn_sublsh2_n 1 +/* #undef HAVE_NATIVE_mpn_sublsh_n */ +/* #undef HAVE_NATIVE_mpn_sublsh1_nc */ +/* #undef HAVE_NATIVE_mpn_sublsh2_nc */ +/* #undef 
HAVE_NATIVE_mpn_sublsh_nc */ +/* #undef HAVE_NATIVE_mpn_sublsh1_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh2_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh1_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh2_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_sublsh_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_submul_1c */ +/* #undef HAVE_NATIVE_mpn_tabselect */ +/* #undef HAVE_NATIVE_mpn_udiv_qrnnd */ +/* #undef HAVE_NATIVE_mpn_udiv_qrnnd_r */ +/* #undef HAVE_NATIVE_mpn_umul_ppmm */ +/* #undef HAVE_NATIVE_mpn_umul_ppmm_r */ +#define HAVE_NATIVE_mpn_xor_n 1 +#define HAVE_NATIVE_mpn_xnor_n 1 + +/* Define to 1 if you have the `nl_langinfo' function. */ +#define HAVE_NL_LANGINFO 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_NL_TYPES_H 1 + +/* Define to 1 if you have the `obstack_vprintf' function. */ +/* #undef HAVE_OBSTACK_VPRINTF */ + +/* Define to 1 if you have the `popen' function. */ +#define HAVE_POPEN 1 + +/* Define to 1 if you have the `processor_info' function. */ +#define HAVE_PROCESSOR_INFO 1 + +/* Define to 1 if `struct pst_processor' exists and contains + `psp_iticksperclktick'. */ +/* #undef HAVE_PSP_ITICKSPERCLKTICK */ + +/* Define to 1 if you have the `pstat_getprocessor' function. */ +/* #undef HAVE_PSTAT_GETPROCESSOR */ + +/* Define to 1 if the system has the type `ptrdiff_t'. */ +#define HAVE_PTRDIFF_T 1 + +/* Define to 1 if the system has the type `quad_t'. */ +#define HAVE_QUAD_T 1 + +/* Define to 1 if you have the `raise' function. */ +#define HAVE_RAISE 1 + +/* Define to 1 if you have the `read_real_time' function. */ +/* #undef HAVE_READ_REAL_TIME */ + +/* Define to 1 if you have the `sigaction' function. */ +#define HAVE_SIGACTION 1 + +/* Define to 1 if you have the `sigaltstack' function. */ +#define HAVE_SIGALTSTACK 1 + +/* Define to 1 if you have the `sigstack' function. 
*/ +/* #undef HAVE_SIGSTACK */ + +/* Tune directory speed_cyclecounter, undef=none, 1=32bits, 2=64bits) */ +#define HAVE_SPEED_CYCLECOUNTER 2 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SSTREAM */ + +/* Define to 1 if the system has the type `stack_t'. */ +#define HAVE_STACK_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if the system has the type `std::locale'. */ +/* #undef HAVE_STD__LOCALE */ + +/* Define to 1 if you have the `strchr' function. */ +#define HAVE_STRCHR 1 + +/* Define to 1 if you have the `strerror' function. */ +#define HAVE_STRERROR 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strnlen' function. */ +#define HAVE_STRNLEN 1 + +/* Define to 1 if you have the `strtol' function. */ +#define HAVE_STRTOL 1 + +/* Define to 1 if you have the `strtoul' function. */ +#define HAVE_STRTOUL 1 + +/* Define to 1 if you have the `sysconf' function. */ +#define HAVE_SYSCONF 1 + +/* Define to 1 if you have the `sysctl' function. */ +#define HAVE_SYSCTL 1 + +/* Define to 1 if you have the `sysctlbyname' function. */ +#define HAVE_SYSCTLBYNAME 1 + +/* Define to 1 if you have the `syssgi' function. */ +/* #undef HAVE_SYSSGI */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_ATTRIBUTES_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_IOGRAPH_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_MMAN_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PROCESSOR_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PSTAT_H */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SYS_RESOURCE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSCTL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSINFO_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSSGI_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYSTEMCFG_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIMES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the `times' function. */ +#define HAVE_TIMES 1 + +/* Define to 1 if the system has the type `uint_least32_t'. */ +#define HAVE_UINT_LEAST32_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to 1 if you have the `vsnprintf' function and it works properly. */ +#define HAVE_VSNPRINTF 1 + +/* Define to 1 for Windos/64 */ +/* #undef HOST_DOS64 */ + +/* Assembler local label prefix */ +#define LSYM_PREFIX "L" + +/* Define to the sub-directory where libtool stores uninstalled libraries. */ +#define LT_OBJDIR ".libs/" + +/* Define to 1 to disable the use of inline assembly */ +/* #undef NO_ASM */ + +/* Name of package */ +#define PACKAGE "gmp" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "gmp-bugs@gmplib.org, see https://gmplib.org/manual/Reporting-Bugs.html" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "GNU MP" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "GNU MP 6.2.1" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "gmp" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "http://www.gnu.org/software/gmp/" + +/* Define to the version of this package. 
*/ +#define PACKAGE_VERSION "6.2.1" + +/* Define as the return type of signal handlers (`int' or `void'). */ +#define RETSIGTYPE void + +/* The size of `mp_limb_t', as computed by sizeof. */ +#define SIZEOF_MP_LIMB_T 8 + +/* The size of `unsigned', as computed by sizeof. */ +#define SIZEOF_UNSIGNED 4 + +/* The size of `unsigned long', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_LONG 8 + +/* The size of `unsigned short', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_SHORT 2 + +/* The size of `void *', as computed by sizeof. */ +#define SIZEOF_VOID_P 8 + +/* Define to 1 if sscanf requires writable inputs */ +/* #undef SSCANF_WRITABLE_INPUT */ + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */ +#define TIME_WITH_SYS_TIME 1 + +/* Maximum size the tune program can test for SQR_TOOM2_THRESHOLD */ +/* #undef TUNE_SQR_TOOM2_MAX */ + +/* Version number of package */ +#define VERSION "6.2.1" + +/* Define to 1 to enable ASSERT checking, per --enable-assert */ +/* #undef WANT_ASSERT */ + +/* Define to 1 to enable GMP_CPU_TYPE faking cpuid, per --enable-fake-cpuid */ +/* #undef WANT_FAKE_CPUID */ + +/* Define to 1 when building a fat binary. */ +/* #undef WANT_FAT_BINARY */ + +/* Define to 1 to enable FFTs for multiplication, per --enable-fft */ +#define WANT_FFT 1 + +/* Define to 1 to enable old mpn_mul_fft_full for multiplication, per + --enable-old-fft-full */ +/* #undef WANT_OLD_FFT_FULL */ + +/* Define to 1 if --enable-profiling=gprof */ +/* #undef WANT_PROFILING_GPROF */ + +/* Define to 1 if --enable-profiling=instrument */ +/* #undef WANT_PROFILING_INSTRUMENT */ + +/* Define to 1 if --enable-profiling=prof */ +/* #undef WANT_PROFILING_PROF */ + +/* Define one of these to 1 for the desired temporary memory allocation + method, per --enable-alloca. 
*/ +#define WANT_TMP_ALLOCA 1 +/* #undef WANT_TMP_REENTRANT */ +/* #undef WANT_TMP_NOTREENTRANT */ +/* #undef WANT_TMP_DEBUG */ + +/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most + significant byte first (like Motorola and SPARC, unlike Intel). */ +#if defined AC_APPLE_UNIVERSAL_BUILD +# if defined __BIG_ENDIAN__ +# define WORDS_BIGENDIAN 1 +# endif +#else +# ifndef WORDS_BIGENDIAN +/* # undef WORDS_BIGENDIAN */ +# endif +#endif + +/* Define to 1 if the assembler understands the mulx instruction */ +#define X86_ASM_MULX 1 + +/* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a + `char[]'. */ +#define YYTEXT_POINTER 1 + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +/* #undef inline */ +#endif + +/* Define to the equivalent of the C99 'restrict' keyword, or to + nothing if this is not supported. Do not define if restrict is + supported directly. */ +#define restrict __restrict +/* Work around a bug in Sun C++: it does not support _Restrict or + __restrict__, even though the corresponding Sun C compiler ends up with + "#define restrict _Restrict" or "#define restrict __restrict__" in the + previous line. Perhaps some future version of Sun C++ will work with + restrict; if so, hopefully it defines __RESTRICT like Sun C does. */ +#if defined __SUNPRO_CC && !defined __RESTRICT +# define _Restrict +# define __restrict__ +#endif + +/* Define to empty if the keyword `volatile' does not work. Warning: valid + code using `volatile' can become incorrect without. Disable with care. 
*/ +/* #undef volatile */ From 6453dafa461cd28799192343f796ddf97ea214d7 Mon Sep 17 00:00:00 2001 From: Santeri Hannula Date: Thu, 10 Oct 2024 19:38:55 +0300 Subject: [PATCH 88/97] ci: update path filters --- .github/workflows/develop.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/develop.yml b/.github/workflows/develop.yml index 6f9acb4cb0..36aedc0d60 100644 --- a/.github/workflows/develop.yml +++ b/.github/workflows/develop.yml @@ -5,13 +5,13 @@ on: branches: - develop paths: - - '*.bazel' - - '.bazelrc' - - '.github/workflows/**.yml' - - 'PACE' - - 'VERSION' - - 'bazel/**' + - 'build.zig' + - 'build.zig.zon' + - 'ext/**' + - '!ext/**.md' - 'pkg/**' + - '.github/workflows/**.yml' + - '*.sh' jobs: urbit: From a5f81863a8e1427632447f4e7b851e270eae63cc Mon Sep 17 00:00:00 2001 From: Santeri Hannula Date: Fri, 11 Oct 2024 12:23:22 +0300 Subject: [PATCH 89/97] docs: add configure opts to gmp readme --- ext/gmp/gen/README.md | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/ext/gmp/gen/README.md b/ext/gmp/gen/README.md index 0a9dab4266..836e13e2da 100644 --- a/ext/gmp/gen/README.md +++ b/ext/gmp/gen/README.md @@ -1,8 +1,23 @@ # Generated architecture specific `.c`, `.s`, and `.h` files -To generate these, first run the `./configure` script under the unpacked GMP -dependency directory. Afterwards, navigate under `mpn/` and run the the -following to generate the assembly files: +To generate these, first run the `./configure` script under the unpacked GMP dependency directory with the following options: + +maoOS: +```terminal +./configure --with-pic --disable-shared +``` + +linux-x86_64: +```terminal +./configure --with-pic --disable-shared --host=x86_64-linux-musl +``` + +linux-aarch64: +```terminal +./configure --with-pic --disable-shared --host=aarch64-linux-musl +``` + +Next, navigate under `mpn/` and run the following to generate the assembly files: ```bash for file in $(find . 
-maxdepth 1 -print | grep "\.asm$"); do From fedec585743de76cecf26e0cdf5201df966bfeb6 Mon Sep 17 00:00:00 2001 From: Pyry Kovanen Date: Fri, 11 Oct 2024 13:23:28 +0300 Subject: [PATCH 90/97] gmp: fix readme typo --- ext/gmp/gen/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/gmp/gen/README.md b/ext/gmp/gen/README.md index 836e13e2da..6228292b1b 100644 --- a/ext/gmp/gen/README.md +++ b/ext/gmp/gen/README.md @@ -2,7 +2,7 @@ To generate these, first run the `./configure` script under the unpacked GMP dependency directory with the following options: -maoOS: +macOS: ```terminal ./configure --with-pic --disable-shared ``` From 0a26ad082c7178795506225fb9ec129690effa98 Mon Sep 17 00:00:00 2001 From: pkova Date: Thu, 17 Oct 2024 11:30:29 +0000 Subject: [PATCH 91/97] build: upgrade gmp to 6.3.0 --- ext/gmp/build.zig | 1 + ext/gmp/build.zig.zon | 4 +-- ext/gmp/gen/aarch64-linux/config.h | 20 ++++++----- ext/gmp/gen/aarch64-linux/sieve_table.h | 46 +++++++++++++++++++++++++ ext/gmp/gen/x86_64-linux/sieve_table.h | 46 +++++++++++++++++++++++++ 5 files changed, 107 insertions(+), 10 deletions(-) create mode 100644 ext/gmp/gen/aarch64-linux/sieve_table.h create mode 100644 ext/gmp/gen/x86_64-linux/sieve_table.h diff --git a/ext/gmp/build.zig b/ext/gmp/build.zig index 2444000e3d..576e18aa7a 100644 --- a/ext/gmp/build.zig +++ b/ext/gmp/build.zig @@ -563,6 +563,7 @@ const generic_c_sources = [_][]const u8{ "mpn/generic/mulmid_basecase.c", "mpn/generic/mulmid_n.c", "mpn/generic/mulmod_bnm1.c", + "mpn/generic/mulmod_bknp1.c", "mpn/generic/neg.c", "mpn/generic/nussbaumer_mul.c", "mpn/generic/perfpow.c", diff --git a/ext/gmp/build.zig.zon b/ext/gmp/build.zig.zon index 3349cef8a6..47538723d8 100644 --- a/ext/gmp/build.zig.zon +++ b/ext/gmp/build.zig.zon @@ -3,8 +3,8 @@ .version = "0.0.1", .dependencies = .{ .gmp = .{ - .url = "https://github.com/alisw/GMP/archive/refs/tags/v6.2.1.tar.gz", - .hash = 
"12209dd340fd48ad775604d2d4e95155dcf106b8f6c63dd054641d606e2007d806f4", + .url = "https://ftp.gnu.org/gnu/gmp/gmp-6.3.0.tar.gz", + .hash = "1220d46202c17aa35ab5848a7f7a812b797c9f07698f263c8a02b4ad9640a1bbe0e3", }, }, .paths = .{ diff --git a/ext/gmp/gen/aarch64-linux/config.h b/ext/gmp/gen/aarch64-linux/config.h index d2e56c8c54..ad9547c325 100644 --- a/ext/gmp/gen/aarch64-linux/config.h +++ b/ext/gmp/gen/aarch64-linux/config.h @@ -3,7 +3,7 @@ /* -Copyright 1996-2020 Free Software Foundation, Inc. +Copyright 1996-2022 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -187,6 +187,7 @@ see https://www.gnu.org/licenses/. /* #undef HAVE_HOST_CPU_skylake */ /* #undef HAVE_HOST_CPU_silvermont */ /* #undef HAVE_HOST_CPU_goldmont */ +/* #undef HAVE_HOST_CPU_tremont */ /* #undef HAVE_HOST_CPU_k8 */ /* #undef HAVE_HOST_CPU_k10 */ /* #undef HAVE_HOST_CPU_bulldozer */ @@ -201,6 +202,9 @@ see https://www.gnu.org/licenses/. /* #undef HAVE_HOST_CPU_s390_z9 */ /* #undef HAVE_HOST_CPU_s390_z10 */ /* #undef HAVE_HOST_CPU_s390_z196 */ +/* #undef HAVE_HOST_CPU_s390_z13 */ +/* #undef HAVE_HOST_CPU_s390_z14 */ +/* #undef HAVE_HOST_CPU_s390_z15 */ /* Define to 1 iff we have a s390 with 64-bit registers. */ /* #undef HAVE_HOST_CPU_s390_zarch */ @@ -300,7 +304,7 @@ see https://www.gnu.org/licenses/. /* #undef HAVE_NATIVE_mpn_div_qr_2 */ /* #undef HAVE_NATIVE_mpn_divexact_1 */ /* #undef HAVE_NATIVE_mpn_divexact_by3c */ -/* #undef HAVE_NATIVE_mpn_divrem_1 */ +#define HAVE_NATIVE_mpn_divrem_1 1 /* #undef HAVE_NATIVE_mpn_divrem_1c */ /* #undef HAVE_NATIVE_mpn_divrem_2 */ /* #undef HAVE_NATIVE_mpn_gcd_1 */ @@ -333,7 +337,7 @@ see https://www.gnu.org/licenses/. 
#define HAVE_NATIVE_mpn_nand_n 1 #define HAVE_NATIVE_mpn_nior_n 1 #define HAVE_NATIVE_mpn_popcount 1 -/* #undef HAVE_NATIVE_mpn_preinv_divrem_1 */ +#define HAVE_NATIVE_mpn_preinv_divrem_1 1 /* #undef HAVE_NATIVE_mpn_preinv_mod_1 */ /* #undef HAVE_NATIVE_mpn_redc_1 */ /* #undef HAVE_NATIVE_mpn_redc_2 */ @@ -382,7 +386,7 @@ see https://www.gnu.org/licenses/. #define HAVE_NL_TYPES_H 1 /* Define to 1 if you have the `obstack_vprintf' function. */ -/* #define HAVE_OBSTACK_VPRINTF 1 */ +/* #undef HAVE_OBSTACK_VPRINTF 1 */ /* Define to 1 if you have the `popen' function. */ #define HAVE_POPEN 1 @@ -542,13 +546,13 @@ see https://www.gnu.org/licenses/. #define PACKAGE "gmp" /* Define to the address where bug reports for this package should be sent. */ -#define PACKAGE_BUGREPORT "gmp-bugs@gmplib.org, see https://gmplib.org/manual/Reporting-Bugs.html" +#define PACKAGE_BUGREPORT "gmp-bugs@gmplib.org (see https://gmplib.org/manual/Reporting-Bugs.html)" /* Define to the full name of this package. */ #define PACKAGE_NAME "GNU MP" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "GNU MP 6.2.1" +#define PACKAGE_STRING "GNU MP 6.3.0" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "gmp" @@ -557,7 +561,7 @@ see https://www.gnu.org/licenses/. #define PACKAGE_URL "http://www.gnu.org/software/gmp/" /* Define to the version of this package. */ -#define PACKAGE_VERSION "6.2.1" +#define PACKAGE_VERSION "6.3.0" /* Define as the return type of signal handlers (`int' or `void'). */ #define RETSIGTYPE void @@ -590,7 +594,7 @@ see https://www.gnu.org/licenses/. 
#define TUNE_SQR_TOOM2_MAX SQR_TOOM2_MAX_GENERIC /* Version number of package */ -#define VERSION "6.2.1" +#define VERSION "6.3.0" /* Define to 1 to enable ASSERT checking, per --enable-assert */ /* #undef WANT_ASSERT */ diff --git a/ext/gmp/gen/aarch64-linux/sieve_table.h b/ext/gmp/gen/aarch64-linux/sieve_table.h new file mode 100644 index 0000000000..ee9ac14360 --- /dev/null +++ b/ext/gmp/gen/aarch64-linux/sieve_table.h @@ -0,0 +1,46 @@ +/* This file generated by gen-sieve.c - DO NOT EDIT. */ + +#if GMP_LIMB_BITS != 64 +Error, error, this data is for 64 bits +#endif + +#define PRIMESIEVE_INIT_TABLE \ + CNST_LIMB (0x3294C9E069128480), /* 5 - 196 (42 primes) */ \ + CNST_LIMB (0x95A35E1EC4AB21DC), /* 197 - 388 (32 primes) */ \ + CNST_LIMB (0x4AD7CE99B8693366), /* 389 - 580 (30 primes) */ \ + CNST_LIMB (0x6595B6DA728DC52B), /* 581 - 772 (30 primes) */ \ + CNST_LIMB (0xEA6D9F8787B0CEDE), /* 773 - 964 (26 primes) */ \ + CNST_LIMB (0x3F56A1F4CD3275A9), /* 965 - 1156 (29 primes) */ \ + CNST_LIMB (0xFD3848FB74A76ADB), /* 1157 - 1348 (26 primes) */ \ + CNST_LIMB (0xDBBA0DD1A1EDF6AF), /* 1349 - 1540 (25 primes) */ \ + CNST_LIMB (0xCEC7F17ED22799A5), /* 1541 - 1732 (27 primes) */ \ + CNST_LIMB (0xEAEC17BDBB717D56), /* 1733 - 1924 (24 primes) */ \ + CNST_LIMB (0x3B0EB7B3585AFCF3), /* 1925 - 2116 (26 primes) */ \ + CNST_LIMB (0xE563D8F69FDF6C4F), /* 2117 - 2308 (23 primes) */ \ + CNST_LIMB (0xFE5BA7ABA45E92FC), /* 2309 - 2500 (25 primes) */ \ + CNST_LIMB (0x158DEE6F3BF49B7D), /* 2501 - 2692 (24 primes) */ \ + CNST_LIMB (0xBE5A7BC4EDE6CD1A), /* 2693 - 2884 (26 primes) */ \ + CNST_LIMB (0xD7679B3FCA7BB6AD), /* 2885 - 3076 (22 primes) */ \ + CNST_LIMB (0xC3F66B971FEF37E9), /* 3077 - 3268 (22 primes) */ \ + CNST_LIMB (0x6F7EBCF339C953FD), /* 3269 - 3460 (22 primes) */ \ + CNST_LIMB (0xD5A5ECDCD235DBF0), /* 3461 - 3652 (27 primes) */ \ + CNST_LIMB (0xECFA7B2FD5B65E3B), /* 3653 - 3844 (22 primes) */ \ + CNST_LIMB (0xD28EFDF9C89F67B1), /* 3845 - 4036 (25 primes) */ \ + CNST_LIMB 
(0xCB7F7C7A3DD3AF4F), /* 4037 - 4228 (21 primes) */ \ + CNST_LIMB (0xEEBED6CDFF6B32CC), /* 4229 - 4420 (22 primes) */ \ + CNST_LIMB (0xD5BD73F85ECFA97C), /* 4421 - 4612 (23 primes) */ \ + CNST_LIMB (0x21FDBE4FBBAD48F7), /* 4613 - 4804 (24 primes) */ \ + CNST_LIMB (0x5E35A3B5EEB7FDE7), /* 4805 - 4996 (21 primes) */ \ + CNST_LIMB (0xD9EBFD53A7DBBCC9), /* 4997 - 5188 (22 primes) */ \ + CNST_LIMB (0xFF9EDEAF2EFE1F76), /* 5189 - 5380 (18 primes) */ +#define PRIMESIEVE_NUMBEROF_TABLE 28 +/* #define PRIMESIEVE_PRIMES_IN_TABLE 706 */ +#define PRIMESIEVE_HIGHEST_PRIME 5351 +/* #define PRIMESIEVE_FIRST_UNCHECKED 5381 */ + +#define SIEVE_MASK1 CNST_LIMB(0x3204C1A049120485) +#define SIEVE_MASKT CNST_LIMB(0xA1204892058) +#define SIEVE_2MSK1 CNST_LIMB(0x29048402110840A) +#define SIEVE_2MSK2 CNST_LIMB(0x9402180C40230184) +#define SIEVE_2MSKT CNST_LIMB(0x5021088402120) + diff --git a/ext/gmp/gen/x86_64-linux/sieve_table.h b/ext/gmp/gen/x86_64-linux/sieve_table.h new file mode 100644 index 0000000000..ee9ac14360 --- /dev/null +++ b/ext/gmp/gen/x86_64-linux/sieve_table.h @@ -0,0 +1,46 @@ +/* This file generated by gen-sieve.c - DO NOT EDIT. 
*/ + +#if GMP_LIMB_BITS != 64 +Error, error, this data is for 64 bits +#endif + +#define PRIMESIEVE_INIT_TABLE \ + CNST_LIMB (0x3294C9E069128480), /* 5 - 196 (42 primes) */ \ + CNST_LIMB (0x95A35E1EC4AB21DC), /* 197 - 388 (32 primes) */ \ + CNST_LIMB (0x4AD7CE99B8693366), /* 389 - 580 (30 primes) */ \ + CNST_LIMB (0x6595B6DA728DC52B), /* 581 - 772 (30 primes) */ \ + CNST_LIMB (0xEA6D9F8787B0CEDE), /* 773 - 964 (26 primes) */ \ + CNST_LIMB (0x3F56A1F4CD3275A9), /* 965 - 1156 (29 primes) */ \ + CNST_LIMB (0xFD3848FB74A76ADB), /* 1157 - 1348 (26 primes) */ \ + CNST_LIMB (0xDBBA0DD1A1EDF6AF), /* 1349 - 1540 (25 primes) */ \ + CNST_LIMB (0xCEC7F17ED22799A5), /* 1541 - 1732 (27 primes) */ \ + CNST_LIMB (0xEAEC17BDBB717D56), /* 1733 - 1924 (24 primes) */ \ + CNST_LIMB (0x3B0EB7B3585AFCF3), /* 1925 - 2116 (26 primes) */ \ + CNST_LIMB (0xE563D8F69FDF6C4F), /* 2117 - 2308 (23 primes) */ \ + CNST_LIMB (0xFE5BA7ABA45E92FC), /* 2309 - 2500 (25 primes) */ \ + CNST_LIMB (0x158DEE6F3BF49B7D), /* 2501 - 2692 (24 primes) */ \ + CNST_LIMB (0xBE5A7BC4EDE6CD1A), /* 2693 - 2884 (26 primes) */ \ + CNST_LIMB (0xD7679B3FCA7BB6AD), /* 2885 - 3076 (22 primes) */ \ + CNST_LIMB (0xC3F66B971FEF37E9), /* 3077 - 3268 (22 primes) */ \ + CNST_LIMB (0x6F7EBCF339C953FD), /* 3269 - 3460 (22 primes) */ \ + CNST_LIMB (0xD5A5ECDCD235DBF0), /* 3461 - 3652 (27 primes) */ \ + CNST_LIMB (0xECFA7B2FD5B65E3B), /* 3653 - 3844 (22 primes) */ \ + CNST_LIMB (0xD28EFDF9C89F67B1), /* 3845 - 4036 (25 primes) */ \ + CNST_LIMB (0xCB7F7C7A3DD3AF4F), /* 4037 - 4228 (21 primes) */ \ + CNST_LIMB (0xEEBED6CDFF6B32CC), /* 4229 - 4420 (22 primes) */ \ + CNST_LIMB (0xD5BD73F85ECFA97C), /* 4421 - 4612 (23 primes) */ \ + CNST_LIMB (0x21FDBE4FBBAD48F7), /* 4613 - 4804 (24 primes) */ \ + CNST_LIMB (0x5E35A3B5EEB7FDE7), /* 4805 - 4996 (21 primes) */ \ + CNST_LIMB (0xD9EBFD53A7DBBCC9), /* 4997 - 5188 (22 primes) */ \ + CNST_LIMB (0xFF9EDEAF2EFE1F76), /* 5189 - 5380 (18 primes) */ +#define PRIMESIEVE_NUMBEROF_TABLE 28 +/* #define 
PRIMESIEVE_PRIMES_IN_TABLE 706 */ +#define PRIMESIEVE_HIGHEST_PRIME 5351 +/* #define PRIMESIEVE_FIRST_UNCHECKED 5381 */ + +#define SIEVE_MASK1 CNST_LIMB(0x3204C1A049120485) +#define SIEVE_MASKT CNST_LIMB(0xA1204892058) +#define SIEVE_2MSK1 CNST_LIMB(0x29048402110840A) +#define SIEVE_2MSK2 CNST_LIMB(0x9402180C40230184) +#define SIEVE_2MSKT CNST_LIMB(0x5021088402120) + From d0e666577518654c80c98ec7e26e3606db0bc17d Mon Sep 17 00:00:00 2001 From: pkova Date: Thu, 17 Oct 2024 11:30:29 +0000 Subject: [PATCH 92/97] build: upgrade gmp to 6.3.0 --- ext/gmp/build.zig | 1 + ext/gmp/build.zig.zon | 4 +- ext/gmp/gen/aarch64-linux/config.h | 20 ++++--- ext/gmp/gen/aarch64-linux/sieve_table.h | 46 ++++++++++++++++ ext/gmp/gen/aarch64-macos/mpn/add_n.s | 12 ++--- ext/gmp/gen/aarch64-macos/mpn/addlsh1_n.s | 12 ++--- ext/gmp/gen/aarch64-macos/mpn/addlsh2_n.s | 12 ++--- ext/gmp/gen/aarch64-macos/mpn/addmul_1.s | 5 ++ ext/gmp/gen/aarch64-macos/mpn/and_n.s | 12 ++--- ext/gmp/gen/aarch64-macos/mpn/andn_n.s | 12 ++--- ext/gmp/gen/aarch64-macos/mpn/cnd_add_n.s | 12 ++--- ext/gmp/gen/aarch64-macos/mpn/cnd_sub_n.s | 12 ++--- ext/gmp/gen/aarch64-macos/mpn/com.s | 52 +++++++++++-------- ext/gmp/gen/aarch64-macos/mpn/copyd.s | 42 ++++++--------- ext/gmp/gen/aarch64-macos/mpn/copyi.s | 28 +++++----- ext/gmp/gen/aarch64-macos/mpn/ior_n.s | 12 ++--- ext/gmp/gen/aarch64-macos/mpn/iorn_n.s | 12 ++--- ext/gmp/gen/aarch64-macos/mpn/lshift.s | 10 ++-- ext/gmp/gen/aarch64-macos/mpn/lshiftc.s | 10 ++-- ext/gmp/gen/aarch64-macos/mpn/mul_1.s | 15 +++--- ext/gmp/gen/aarch64-macos/mpn/nand_n.s | 12 ++--- ext/gmp/gen/aarch64-macos/mpn/nior_n.s | 12 ++--- ext/gmp/gen/aarch64-macos/mpn/rsblsh1_n.s | 12 ++--- ext/gmp/gen/aarch64-macos/mpn/rsblsh2_n.s | 12 ++--- ext/gmp/gen/aarch64-macos/mpn/rsh1add_n.s | 20 +++---- ext/gmp/gen/aarch64-macos/mpn/rsh1sub_n.s | 20 +++---- ext/gmp/gen/aarch64-macos/mpn/rshift.s | 10 ++-- .../gen/aarch64-macos/mpn/sqr_diag_addlsh1.s | 10 ++-- ext/gmp/gen/aarch64-macos/mpn/sub_n.s | 
12 ++--- ext/gmp/gen/aarch64-macos/mpn/sublsh1_n.s | 12 ++--- ext/gmp/gen/aarch64-macos/mpn/sublsh2_n.s | 12 ++--- ext/gmp/gen/aarch64-macos/mpn/submul_1.s | 5 ++ ext/gmp/gen/aarch64-macos/mpn/xnor_n.s | 12 ++--- ext/gmp/gen/aarch64-macos/mpn/xor_n.s | 12 ++--- ext/gmp/gen/x86_64-linux/sieve_table.h | 46 ++++++++++++++++ 35 files changed, 336 insertions(+), 224 deletions(-) create mode 100644 ext/gmp/gen/aarch64-linux/sieve_table.h create mode 100644 ext/gmp/gen/x86_64-linux/sieve_table.h diff --git a/ext/gmp/build.zig b/ext/gmp/build.zig index 2444000e3d..576e18aa7a 100644 --- a/ext/gmp/build.zig +++ b/ext/gmp/build.zig @@ -563,6 +563,7 @@ const generic_c_sources = [_][]const u8{ "mpn/generic/mulmid_basecase.c", "mpn/generic/mulmid_n.c", "mpn/generic/mulmod_bnm1.c", + "mpn/generic/mulmod_bknp1.c", "mpn/generic/neg.c", "mpn/generic/nussbaumer_mul.c", "mpn/generic/perfpow.c", diff --git a/ext/gmp/build.zig.zon b/ext/gmp/build.zig.zon index 3349cef8a6..47538723d8 100644 --- a/ext/gmp/build.zig.zon +++ b/ext/gmp/build.zig.zon @@ -3,8 +3,8 @@ .version = "0.0.1", .dependencies = .{ .gmp = .{ - .url = "https://github.com/alisw/GMP/archive/refs/tags/v6.2.1.tar.gz", - .hash = "12209dd340fd48ad775604d2d4e95155dcf106b8f6c63dd054641d606e2007d806f4", + .url = "https://ftp.gnu.org/gnu/gmp/gmp-6.3.0.tar.gz", + .hash = "1220d46202c17aa35ab5848a7f7a812b797c9f07698f263c8a02b4ad9640a1bbe0e3", }, }, .paths = .{ diff --git a/ext/gmp/gen/aarch64-linux/config.h b/ext/gmp/gen/aarch64-linux/config.h index d2e56c8c54..ad9547c325 100644 --- a/ext/gmp/gen/aarch64-linux/config.h +++ b/ext/gmp/gen/aarch64-linux/config.h @@ -3,7 +3,7 @@ /* -Copyright 1996-2020 Free Software Foundation, Inc. +Copyright 1996-2022 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -187,6 +187,7 @@ see https://www.gnu.org/licenses/. 
/* #undef HAVE_HOST_CPU_skylake */ /* #undef HAVE_HOST_CPU_silvermont */ /* #undef HAVE_HOST_CPU_goldmont */ +/* #undef HAVE_HOST_CPU_tremont */ /* #undef HAVE_HOST_CPU_k8 */ /* #undef HAVE_HOST_CPU_k10 */ /* #undef HAVE_HOST_CPU_bulldozer */ @@ -201,6 +202,9 @@ see https://www.gnu.org/licenses/. /* #undef HAVE_HOST_CPU_s390_z9 */ /* #undef HAVE_HOST_CPU_s390_z10 */ /* #undef HAVE_HOST_CPU_s390_z196 */ +/* #undef HAVE_HOST_CPU_s390_z13 */ +/* #undef HAVE_HOST_CPU_s390_z14 */ +/* #undef HAVE_HOST_CPU_s390_z15 */ /* Define to 1 iff we have a s390 with 64-bit registers. */ /* #undef HAVE_HOST_CPU_s390_zarch */ @@ -300,7 +304,7 @@ see https://www.gnu.org/licenses/. /* #undef HAVE_NATIVE_mpn_div_qr_2 */ /* #undef HAVE_NATIVE_mpn_divexact_1 */ /* #undef HAVE_NATIVE_mpn_divexact_by3c */ -/* #undef HAVE_NATIVE_mpn_divrem_1 */ +#define HAVE_NATIVE_mpn_divrem_1 1 /* #undef HAVE_NATIVE_mpn_divrem_1c */ /* #undef HAVE_NATIVE_mpn_divrem_2 */ /* #undef HAVE_NATIVE_mpn_gcd_1 */ @@ -333,7 +337,7 @@ see https://www.gnu.org/licenses/. #define HAVE_NATIVE_mpn_nand_n 1 #define HAVE_NATIVE_mpn_nior_n 1 #define HAVE_NATIVE_mpn_popcount 1 -/* #undef HAVE_NATIVE_mpn_preinv_divrem_1 */ +#define HAVE_NATIVE_mpn_preinv_divrem_1 1 /* #undef HAVE_NATIVE_mpn_preinv_mod_1 */ /* #undef HAVE_NATIVE_mpn_redc_1 */ /* #undef HAVE_NATIVE_mpn_redc_2 */ @@ -382,7 +386,7 @@ see https://www.gnu.org/licenses/. #define HAVE_NL_TYPES_H 1 /* Define to 1 if you have the `obstack_vprintf' function. */ -/* #define HAVE_OBSTACK_VPRINTF 1 */ +/* #undef HAVE_OBSTACK_VPRINTF 1 */ /* Define to 1 if you have the `popen' function. */ #define HAVE_POPEN 1 @@ -542,13 +546,13 @@ see https://www.gnu.org/licenses/. #define PACKAGE "gmp" /* Define to the address where bug reports for this package should be sent. 
*/ -#define PACKAGE_BUGREPORT "gmp-bugs@gmplib.org, see https://gmplib.org/manual/Reporting-Bugs.html" +#define PACKAGE_BUGREPORT "gmp-bugs@gmplib.org (see https://gmplib.org/manual/Reporting-Bugs.html)" /* Define to the full name of this package. */ #define PACKAGE_NAME "GNU MP" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "GNU MP 6.2.1" +#define PACKAGE_STRING "GNU MP 6.3.0" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "gmp" @@ -557,7 +561,7 @@ see https://www.gnu.org/licenses/. #define PACKAGE_URL "http://www.gnu.org/software/gmp/" /* Define to the version of this package. */ -#define PACKAGE_VERSION "6.2.1" +#define PACKAGE_VERSION "6.3.0" /* Define as the return type of signal handlers (`int' or `void'). */ #define RETSIGTYPE void @@ -590,7 +594,7 @@ see https://www.gnu.org/licenses/. #define TUNE_SQR_TOOM2_MAX SQR_TOOM2_MAX_GENERIC /* Version number of package */ -#define VERSION "6.2.1" +#define VERSION "6.3.0" /* Define to 1 to enable ASSERT checking, per --enable-assert */ /* #undef WANT_ASSERT */ diff --git a/ext/gmp/gen/aarch64-linux/sieve_table.h b/ext/gmp/gen/aarch64-linux/sieve_table.h new file mode 100644 index 0000000000..ee9ac14360 --- /dev/null +++ b/ext/gmp/gen/aarch64-linux/sieve_table.h @@ -0,0 +1,46 @@ +/* This file generated by gen-sieve.c - DO NOT EDIT. 
*/ + +#if GMP_LIMB_BITS != 64 +Error, error, this data is for 64 bits +#endif + +#define PRIMESIEVE_INIT_TABLE \ + CNST_LIMB (0x3294C9E069128480), /* 5 - 196 (42 primes) */ \ + CNST_LIMB (0x95A35E1EC4AB21DC), /* 197 - 388 (32 primes) */ \ + CNST_LIMB (0x4AD7CE99B8693366), /* 389 - 580 (30 primes) */ \ + CNST_LIMB (0x6595B6DA728DC52B), /* 581 - 772 (30 primes) */ \ + CNST_LIMB (0xEA6D9F8787B0CEDE), /* 773 - 964 (26 primes) */ \ + CNST_LIMB (0x3F56A1F4CD3275A9), /* 965 - 1156 (29 primes) */ \ + CNST_LIMB (0xFD3848FB74A76ADB), /* 1157 - 1348 (26 primes) */ \ + CNST_LIMB (0xDBBA0DD1A1EDF6AF), /* 1349 - 1540 (25 primes) */ \ + CNST_LIMB (0xCEC7F17ED22799A5), /* 1541 - 1732 (27 primes) */ \ + CNST_LIMB (0xEAEC17BDBB717D56), /* 1733 - 1924 (24 primes) */ \ + CNST_LIMB (0x3B0EB7B3585AFCF3), /* 1925 - 2116 (26 primes) */ \ + CNST_LIMB (0xE563D8F69FDF6C4F), /* 2117 - 2308 (23 primes) */ \ + CNST_LIMB (0xFE5BA7ABA45E92FC), /* 2309 - 2500 (25 primes) */ \ + CNST_LIMB (0x158DEE6F3BF49B7D), /* 2501 - 2692 (24 primes) */ \ + CNST_LIMB (0xBE5A7BC4EDE6CD1A), /* 2693 - 2884 (26 primes) */ \ + CNST_LIMB (0xD7679B3FCA7BB6AD), /* 2885 - 3076 (22 primes) */ \ + CNST_LIMB (0xC3F66B971FEF37E9), /* 3077 - 3268 (22 primes) */ \ + CNST_LIMB (0x6F7EBCF339C953FD), /* 3269 - 3460 (22 primes) */ \ + CNST_LIMB (0xD5A5ECDCD235DBF0), /* 3461 - 3652 (27 primes) */ \ + CNST_LIMB (0xECFA7B2FD5B65E3B), /* 3653 - 3844 (22 primes) */ \ + CNST_LIMB (0xD28EFDF9C89F67B1), /* 3845 - 4036 (25 primes) */ \ + CNST_LIMB (0xCB7F7C7A3DD3AF4F), /* 4037 - 4228 (21 primes) */ \ + CNST_LIMB (0xEEBED6CDFF6B32CC), /* 4229 - 4420 (22 primes) */ \ + CNST_LIMB (0xD5BD73F85ECFA97C), /* 4421 - 4612 (23 primes) */ \ + CNST_LIMB (0x21FDBE4FBBAD48F7), /* 4613 - 4804 (24 primes) */ \ + CNST_LIMB (0x5E35A3B5EEB7FDE7), /* 4805 - 4996 (21 primes) */ \ + CNST_LIMB (0xD9EBFD53A7DBBCC9), /* 4997 - 5188 (22 primes) */ \ + CNST_LIMB (0xFF9EDEAF2EFE1F76), /* 5189 - 5380 (18 primes) */ +#define PRIMESIEVE_NUMBEROF_TABLE 28 +/* #define 
PRIMESIEVE_PRIMES_IN_TABLE 706 */ +#define PRIMESIEVE_HIGHEST_PRIME 5351 +/* #define PRIMESIEVE_FIRST_UNCHECKED 5381 */ + +#define SIEVE_MASK1 CNST_LIMB(0x3204C1A049120485) +#define SIEVE_MASKT CNST_LIMB(0xA1204892058) +#define SIEVE_2MSK1 CNST_LIMB(0x29048402110840A) +#define SIEVE_2MSK2 CNST_LIMB(0x9402180C40230184) +#define SIEVE_2MSKT CNST_LIMB(0x5021088402120) + diff --git a/ext/gmp/gen/aarch64-macos/mpn/add_n.s b/ext/gmp/gen/aarch64-macos/mpn/add_n.s index 136fdacc83..5f2b539e98 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/add_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/add_n.s @@ -77,7 +77,7 @@ ___gmpn_add_nc: ___gmpn_add_n: cmn xzr, xzr -Lent: lsr x18, x3, #2 +Lent: lsr x17, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x7, [x1] @@ -86,7 +86,7 @@ Lbx1: ldr x7, [x1] str x13, [x0],#8 tbnz x3, #1, Lb11 -Lb01: cbz x18, Lret +Lb01: cbz x17, Lret ldp x4, x5, [x1,#8] ldp x8, x9, [x2,#8] sub x1, x1, #8 @@ -97,7 +97,7 @@ Lb11: ldp x6, x7, [x1,#8] ldp x10, x11, [x2,#8] add x1, x1, #8 add x2, x2, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -110,7 +110,7 @@ Lb00: ldp x4, x5, [x1] Lb10: ldp x6, x7, [x1] ldp x10, x11, [x2] - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -123,8 +123,8 @@ Lmid: ldp x6, x7, [x1,#32]! 
adcs x12, x4, x8 adcs x13, x5, x9 stp x12, x13, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: adcs x12, x6, x10 adcs x13, x7, x11 diff --git a/ext/gmp/gen/aarch64-macos/mpn/addlsh1_n.s b/ext/gmp/gen/aarch64-macos/mpn/addlsh1_n.s index 9cde4af8e2..bf0bdcbeb9 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/addlsh1_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/addlsh1_n.s @@ -86,14 +86,14 @@ .globl ___gmpn_addlsh1_n ___gmpn_addlsh1_n: - lsr x18, x3, #2 + lsr x6, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x5, [x1] tbnz x3, #1, Lb11 Lb01: ldr x11, [x2] - cbz x18, L1 + cbz x6, L1 ldp x8, x9, [x2,#8] lsl x13, x11, #1 adds x15, x13, x5 @@ -115,7 +115,7 @@ Lb11: ldr x9, [x2] adds x17, x13, x5 str x17, [x0],#8 sub x1, x1, #8 - cbz x18, Lend + cbz x6, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -128,7 +128,7 @@ Lb00: adds x11, xzr, xzr Lb10: adds x9, xzr, xzr ldp x10, x11, [x2] sub x1, x1, #16 - cbz x18, Lend + cbz x6, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -145,8 +145,8 @@ Lmid: ldp x4, x5, [x1,#32]! 
adcs x16, x12, x4 adcs x17, x13, x5 stp x16, x17, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x6, x6, #1 + cbnz x6, Ltop Lend: ldp x4, x5, [x1,#16] extr x12, x10, x9, #63 diff --git a/ext/gmp/gen/aarch64-macos/mpn/addlsh2_n.s b/ext/gmp/gen/aarch64-macos/mpn/addlsh2_n.s index f923e69202..e167b893e7 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/addlsh2_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/addlsh2_n.s @@ -86,14 +86,14 @@ .globl ___gmpn_addlsh2_n ___gmpn_addlsh2_n: - lsr x18, x3, #2 + lsr x6, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x5, [x1] tbnz x3, #1, Lb11 Lb01: ldr x11, [x2] - cbz x18, L1 + cbz x6, L1 ldp x8, x9, [x2,#8] lsl x13, x11, #2 adds x15, x13, x5 @@ -115,7 +115,7 @@ Lb11: ldr x9, [x2] adds x17, x13, x5 str x17, [x0],#8 sub x1, x1, #8 - cbz x18, Lend + cbz x6, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -128,7 +128,7 @@ Lb00: adds x11, xzr, xzr Lb10: adds x9, xzr, xzr ldp x10, x11, [x2] sub x1, x1, #16 - cbz x18, Lend + cbz x6, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -145,8 +145,8 @@ Lmid: ldp x4, x5, [x1,#32]! 
adcs x16, x12, x4 adcs x17, x13, x5 stp x16, x17, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x6, x6, #1 + cbnz x6, Ltop Lend: ldp x4, x5, [x1,#16] extr x12, x10, x9, #62 diff --git a/ext/gmp/gen/aarch64-macos/mpn/addmul_1.s b/ext/gmp/gen/aarch64-macos/mpn/addmul_1.s index 6c82fb3de6..09ce6a3921 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/addmul_1.s +++ b/ext/gmp/gen/aarch64-macos/mpn/addmul_1.s @@ -53,6 +53,11 @@ + + + + + diff --git a/ext/gmp/gen/aarch64-macos/mpn/and_n.s b/ext/gmp/gen/aarch64-macos/mpn/and_n.s index c6d0d23cb0..6967e35beb 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/and_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/and_n.s @@ -73,7 +73,7 @@ .globl ___gmpn_and_n ___gmpn_and_n: - lsr x18, x3, #2 + lsr x17, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x7, [x1] @@ -82,7 +82,7 @@ Lbx1: ldr x7, [x1] str x15, [x0],#8 tbnz x3, #1, Lb11 -Lb01: cbz x18, Lret +Lb01: cbz x17, Lret ldp x4, x5, [x1,#8] ldp x8, x9, [x2,#8] sub x1, x1, #8 @@ -93,7 +93,7 @@ Lb11: ldp x6, x7, [x1,#8] ldp x10, x11, [x2,#8] add x1, x1, #8 add x2, x2, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -104,7 +104,7 @@ Lb00: ldp x4, x5, [x1],#-16 Lb10: ldp x6, x7, [x1] ldp x10, x11, [x2] - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -117,8 +117,8 @@ Lmid: ldp x6, x7, [x1,#32]! 
and x12, x4, x8 and x13, x5, x9 stp x12, x13, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: and x12, x6, x10 and x13, x7, x11 diff --git a/ext/gmp/gen/aarch64-macos/mpn/andn_n.s b/ext/gmp/gen/aarch64-macos/mpn/andn_n.s index 9d2318f0a3..6f928f10bc 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/andn_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/andn_n.s @@ -73,7 +73,7 @@ .globl ___gmpn_andn_n ___gmpn_andn_n: - lsr x18, x3, #2 + lsr x17, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x7, [x1] @@ -82,7 +82,7 @@ Lbx1: ldr x7, [x1] str x15, [x0],#8 tbnz x3, #1, Lb11 -Lb01: cbz x18, Lret +Lb01: cbz x17, Lret ldp x4, x5, [x1,#8] ldp x8, x9, [x2,#8] sub x1, x1, #8 @@ -93,7 +93,7 @@ Lb11: ldp x6, x7, [x1,#8] ldp x10, x11, [x2,#8] add x1, x1, #8 add x2, x2, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -104,7 +104,7 @@ Lb00: ldp x4, x5, [x1],#-16 Lb10: ldp x6, x7, [x1] ldp x10, x11, [x2] - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -117,8 +117,8 @@ Lmid: ldp x6, x7, [x1,#32]! bic x12, x4, x8 bic x13, x5, x9 stp x12, x13, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: bic x12, x6, x10 bic x13, x7, x11 diff --git a/ext/gmp/gen/aarch64-macos/mpn/cnd_add_n.s b/ext/gmp/gen/aarch64-macos/mpn/cnd_add_n.s index 1282e0241d..606d6c4948 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/cnd_add_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/cnd_add_n.s @@ -72,7 +72,7 @@ ___gmpn_cnd_add_n: cmn xzr, xzr - lsr x18, x4, #2 + lsr x17, x4, #2 tbz x4, #0, Lbx0 Lbx1: ldr x13, [x3] @@ -82,7 +82,7 @@ Lbx1: ldr x13, [x3] str x9, [x1] tbnz x4, #1, Lb11 -Lb01: cbz x18, Lrt +Lb01: cbz x17, Lrt ldp x12, x13, [x3,#8] ldp x10, x11, [x2,#8] sub x2, x2, #8 @@ -93,7 +93,7 @@ Lb01: cbz x18, Lrt Lb11: ldp x12, x13, [x3,#8]! ldp x10, x11, [x2,#8]! 
sub x1, x1, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: ldp x12, x13, [x3] @@ -106,7 +106,7 @@ Lb00: sub x2, x2, #16 b Lmid Lb10: sub x1, x1, #16 - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: bic x6, x12, x0 @@ -123,8 +123,8 @@ Lmid: bic x6, x12, x0 adcs x9, x11, x7 ldp x10, x11, [x2,#32]! stp x8, x9, [x1,#32]! - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: bic x6, x12, x0 bic x7, x13, x0 diff --git a/ext/gmp/gen/aarch64-macos/mpn/cnd_sub_n.s b/ext/gmp/gen/aarch64-macos/mpn/cnd_sub_n.s index 5663667b12..be253fe14d 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/cnd_sub_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/cnd_sub_n.s @@ -72,7 +72,7 @@ ___gmpn_cnd_sub_n: cmp xzr, xzr - lsr x18, x4, #2 + lsr x17, x4, #2 tbz x4, #0, Lbx0 Lbx1: ldr x13, [x3] @@ -82,7 +82,7 @@ Lbx1: ldr x13, [x3] str x9, [x1] tbnz x4, #1, Lb11 -Lb01: cbz x18, Lrt +Lb01: cbz x17, Lrt ldp x12, x13, [x3,#8] ldp x10, x11, [x2,#8] sub x2, x2, #8 @@ -93,7 +93,7 @@ Lb01: cbz x18, Lrt Lb11: ldp x12, x13, [x3,#8]! ldp x10, x11, [x2,#8]! sub x1, x1, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: ldp x12, x13, [x3] @@ -106,7 +106,7 @@ Lb00: sub x2, x2, #16 b Lmid Lb10: sub x1, x1, #16 - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: bic x6, x12, x0 @@ -123,8 +123,8 @@ Lmid: bic x6, x12, x0 sbcs x9, x11, x7 ldp x10, x11, [x2,#32]! stp x8, x9, [x1,#32]! 
- sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: bic x6, x12, x0 bic x7, x13, x0 diff --git a/ext/gmp/gen/aarch64-macos/mpn/com.s b/ext/gmp/gen/aarch64-macos/mpn/com.s index 656c761ce6..5eaf1f4972 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/com.s +++ b/ext/gmp/gen/aarch64-macos/mpn/com.s @@ -45,6 +45,10 @@ + + + + @@ -60,37 +64,41 @@ ___gmpn_com: tbz x0, #3, Lal2 - ld1 {v22.1d}, [x1], #8 + ldr x4, [x1],#8 sub x2, x2, #1 - mvn v22.8b, v22.8b - st1 {v22.1d}, [x0], #8 + mvn x4, x4 + str x4, [x0],#8 -Lal2: ld1 {v26.2d}, [x1], #16 - subs x2, x2, #6 - b.lt Lend +Lal2: ldp x4,x5, [x1],#16 + sub x2, x2, #6 + tbnz x2, #63, Lend .align 4 -Ltop: ld1 {v22.2d}, [x1], #16 - mvn v26.16b, v26.16b - st1 {v26.2d}, [x0], #16 - ld1 {v26.2d}, [x1], #16 - mvn v22.16b, v22.16b - st1 {v22.2d}, [x0], #16 - subs x2, x2, #4 - b.ge Ltop +Ltop: ldp x6,x7, [x1],#32 + mvn x4, x4 + mvn x5, x5 + stp x4,x5, [x0],#32 + ldp x4,x5, [x1,#-16] + mvn x6, x6 + mvn x7, x7 + stp x6,x7, [x0,#-16] + sub x2, x2, #4 + tbz x2, #63, Ltop -Lend: mvn v26.16b, v26.16b - st1 {v26.2d}, [x0], #16 +Lend: mvn x4, x4 + mvn x5, x5 + stp x4,x5, [x0],#16 Lbc: tbz x2, #1, Ltl1 - ld1 {v22.2d}, [x1], #16 - mvn v22.16b, v22.16b - st1 {v22.2d}, [x0], #16 + ldp x4,x5, [x1],#16 + mvn x4, x4 + mvn x5, x5 + stp x4,x5, [x0],#16 Ltl1: tbz x2, #0, Ltl2 - ld1 {v22.1d}, [x1] - mvn v22.8b, v22.8b - st1 {v22.1d}, [x0] + ldr x4, [x1] + mvn x4, x4 + str x4, [x0] Ltl2: ret diff --git a/ext/gmp/gen/aarch64-macos/mpn/copyd.s b/ext/gmp/gen/aarch64-macos/mpn/copyd.s index 1b178c7c40..a3a3af68be 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/copyd.s +++ b/ext/gmp/gen/aarch64-macos/mpn/copyd.s @@ -43,6 +43,11 @@ + + + + + @@ -62,44 +67,31 @@ ___gmpn_copyd: tbz x0, #3, Lal2 - sub x1, x1, #8 - ld1 {v22.1d}, [x1] + ldr x4, [x1,#-8]! sub x2, x2, #1 - sub x0, x0, #8 - st1 {v22.1d}, [x0] + str x4, [x0,#-8]! -Lal2: sub x1, x1, #16 - ld1 {v26.2d}, [x1] +Lal2: ldp x4,x5, [x1,#-16]! 
sub x2, x2, #6 - sub x0, x0, #16 tbnz x2, #63, Lend - sub x1, x1, #16 - mov x12, #-16 - .align 4 -Ltop: ld1 {v22.2d}, [x1], x12 - st1 {v26.2d}, [x0], x12 - ld1 {v26.2d}, [x1], x12 - st1 {v22.2d}, [x0], x12 +Ltop: ldp x6,x7, [x1,#-16] + stp x4,x5, [x0,#-16] + ldp x4,x5, [x1,#-32]! + stp x6,x7, [x0,#-32]! sub x2, x2, #4 tbz x2, #63, Ltop - add x1, x1, #16 - -Lend: st1 {v26.2d}, [x0] +Lend: stp x4,x5, [x0,#-16]! Lbc: tbz x2, #1, Ltl1 - sub x1, x1, #16 - ld1 {v22.2d}, [x1] - sub x0, x0, #16 - st1 {v22.2d}, [x0] + ldp x4,x5, [x1,#-16]! + stp x4,x5, [x0,#-16]! Ltl1: tbz x2, #0, Ltl2 - sub x1, x1, #8 - ld1 {v22.1d}, [x1] - sub x0, x0, #8 - st1 {v22.1d}, [x0] + ldr x4, [x1,#-8] + str x4, [x0,#-8] Ltl2: ret diff --git a/ext/gmp/gen/aarch64-macos/mpn/copyi.s b/ext/gmp/gen/aarch64-macos/mpn/copyi.s index 95e54eaefd..b87f4fcc6f 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/copyi.s +++ b/ext/gmp/gen/aarch64-macos/mpn/copyi.s @@ -45,6 +45,10 @@ + + + + @@ -60,31 +64,31 @@ ___gmpn_copyi: tbz x0, #3, Lal2 - ld1 {v22.1d}, [x1], #8 + ldr x4, [x1],#8 sub x2, x2, #1 - st1 {v22.1d}, [x0], #8 + str x4, [x0],#8 -Lal2: ld1 {v26.2d}, [x1], #16 +Lal2: ldp x4,x5, [x1],#16 sub x2, x2, #6 tbnz x2, #63, Lend .align 4 -Ltop: ld1 {v22.2d}, [x1], #16 - st1 {v26.2d}, [x0], #16 - ld1 {v26.2d}, [x1], #16 - st1 {v22.2d}, [x0], #16 +Ltop: ldp x6,x7, [x1],#32 + stp x4,x5, [x0],#32 + ldp x4,x5, [x1,#-16] + stp x6,x7, [x0,#-16] sub x2, x2, #4 tbz x2, #63, Ltop -Lend: st1 {v26.2d}, [x0], #16 +Lend: stp x4,x5, [x0],#16 Lbc: tbz x2, #1, Ltl1 - ld1 {v22.2d}, [x1], #16 - st1 {v22.2d}, [x0], #16 + ldp x4,x5, [x1],#16 + stp x4,x5, [x0],#16 Ltl1: tbz x2, #0, Ltl2 - ld1 {v22.1d}, [x1] - st1 {v22.1d}, [x0] + ldr x4, [x1] + str x4, [x0] Ltl2: ret diff --git a/ext/gmp/gen/aarch64-macos/mpn/ior_n.s b/ext/gmp/gen/aarch64-macos/mpn/ior_n.s index cfd315a938..4b4b643ece 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/ior_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/ior_n.s @@ -73,7 +73,7 @@ .globl ___gmpn_ior_n ___gmpn_ior_n: - lsr 
x18, x3, #2 + lsr x17, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x7, [x1] @@ -82,7 +82,7 @@ Lbx1: ldr x7, [x1] str x15, [x0],#8 tbnz x3, #1, Lb11 -Lb01: cbz x18, Lret +Lb01: cbz x17, Lret ldp x4, x5, [x1,#8] ldp x8, x9, [x2,#8] sub x1, x1, #8 @@ -93,7 +93,7 @@ Lb11: ldp x6, x7, [x1,#8] ldp x10, x11, [x2,#8] add x1, x1, #8 add x2, x2, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -104,7 +104,7 @@ Lb00: ldp x4, x5, [x1],#-16 Lb10: ldp x6, x7, [x1] ldp x10, x11, [x2] - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -117,8 +117,8 @@ Lmid: ldp x6, x7, [x1,#32]! orr x12, x4, x8 orr x13, x5, x9 stp x12, x13, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: orr x12, x6, x10 orr x13, x7, x11 diff --git a/ext/gmp/gen/aarch64-macos/mpn/iorn_n.s b/ext/gmp/gen/aarch64-macos/mpn/iorn_n.s index 94cc90a0ff..73d86e94a0 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/iorn_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/iorn_n.s @@ -73,7 +73,7 @@ .globl ___gmpn_iorn_n ___gmpn_iorn_n: - lsr x18, x3, #2 + lsr x17, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x7, [x1] @@ -82,7 +82,7 @@ Lbx1: ldr x7, [x1] str x15, [x0],#8 tbnz x3, #1, Lb11 -Lb01: cbz x18, Lret +Lb01: cbz x17, Lret ldp x4, x5, [x1,#8] ldp x8, x9, [x2,#8] sub x1, x1, #8 @@ -93,7 +93,7 @@ Lb11: ldp x6, x7, [x1,#8] ldp x10, x11, [x2,#8] add x1, x1, #8 add x2, x2, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -104,7 +104,7 @@ Lb00: ldp x4, x5, [x1],#-16 Lb10: ldp x6, x7, [x1] ldp x10, x11, [x2] - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -117,8 +117,8 @@ Lmid: ldp x6, x7, [x1,#32]! 
orn x12, x4, x8 orn x13, x5, x9 stp x12, x13, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: orn x12, x6, x10 orn x13, x7, x11 diff --git a/ext/gmp/gen/aarch64-macos/mpn/lshift.s b/ext/gmp/gen/aarch64-macos/mpn/lshift.s index 75dc0fbc9a..14e5a71bd2 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/lshift.s +++ b/ext/gmp/gen/aarch64-macos/mpn/lshift.s @@ -73,7 +73,7 @@ ___gmpn_lshift: add x16, x0, x2, lsl #3 add x1, x1, x2, lsl #3 sub x8, xzr, x3 - lsr x18, x2, #2 + lsr x17, x2, #2 tbz x2, #0, Lbx0 Lbx1: ldr x4, [x1,#-8] @@ -81,7 +81,7 @@ Lbx1: ldr x4, [x1,#-8] Lb01: lsr x0, x4, x8 lsl x2, x4, x3 - cbnz x18, Lgt1 + cbnz x17, Lgt1 str x2, [x16,#-8] ret Lgt1: ldp x4, x5, [x1,#-24] @@ -101,7 +101,7 @@ Lb10: lsr x0, x5, x8 lsl x13, x5, x3 lsr x10, x4, x8 lsl x2, x4, x3 - cbnz x18, Lgt2 + cbnz x17, Lgt2 orr x10, x10, x13 stp x2, x10, [x16,#-16] ret @@ -135,11 +135,11 @@ Llo2: lsr x10, x4, x8 orr x11, x12, x2 stp x10, x11, [x16,#-32]! lsl x2, x4, x3 -Llo0: sub x18, x18, #1 +Llo0: sub x17, x17, #1 Llo3: lsr x10, x6, x8 lsl x13, x7, x3 lsr x12, x7, x8 - cbnz x18, Ltop + cbnz x17, Ltop Lend: orr x10, x10, x13 orr x11, x12, x2 diff --git a/ext/gmp/gen/aarch64-macos/mpn/lshiftc.s b/ext/gmp/gen/aarch64-macos/mpn/lshiftc.s index c4e0b33084..438136f554 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/lshiftc.s +++ b/ext/gmp/gen/aarch64-macos/mpn/lshiftc.s @@ -73,7 +73,7 @@ ___gmpn_lshiftc: add x16, x0, x2, lsl #3 add x1, x1, x2, lsl #3 sub x8, xzr, x3 - lsr x18, x2, #2 + lsr x17, x2, #2 tbz x2, #0, Lbx0 Lbx1: ldr x4, [x1,#-8] @@ -81,7 +81,7 @@ Lbx1: ldr x4, [x1,#-8] Lb01: lsr x0, x4, x8 lsl x2, x4, x3 - cbnz x18, Lgt1 + cbnz x17, Lgt1 mvn x2, x2 str x2, [x16,#-8] ret @@ -102,7 +102,7 @@ Lb10: lsr x0, x5, x8 lsl x13, x5, x3 lsr x10, x4, x8 lsl x2, x4, x3 - cbnz x18, Lgt2 + cbnz x17, Lgt2 eon x10, x10, x13 mvn x2, x2 stp x2, x10, [x16,#-16] @@ -137,11 +137,11 @@ Llo2: lsr x10, x4, x8 eon x11, x12, x2 stp x10, x11, [x16,#-32]! 
lsl x2, x4, x3 -Llo0: sub x18, x18, #1 +Llo0: sub x17, x17, #1 Llo3: lsr x10, x6, x8 lsl x13, x7, x3 lsr x12, x7, x8 - cbnz x18, Ltop + cbnz x17, Ltop Lend: eon x10, x10, x13 eon x11, x12, x2 diff --git a/ext/gmp/gen/aarch64-macos/mpn/mul_1.s b/ext/gmp/gen/aarch64-macos/mpn/mul_1.s index 9a369b1627..7858152199 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/mul_1.s +++ b/ext/gmp/gen/aarch64-macos/mpn/mul_1.s @@ -54,6 +54,7 @@ + .text @@ -71,7 +72,7 @@ ___gmpn_mul_1c: ___gmpn_mul_1: adds x4, xzr, xzr -Lcom: lsr x18, x2, #2 +Lcom: lsr x17, x2, #2 tbnz x2, #0, Lbx1 Lbx0: mov x11, x4 @@ -80,7 +81,7 @@ Lbx0: mov x11, x4 Lb10: ldp x4, x5, [x1] mul x8, x4, x3 umulh x10, x4, x3 - cbz x18, L2 + cbz x17, L2 ldp x6, x7, [x1,#16]! mul x9, x5, x3 b Lmid-8 @@ -95,7 +96,7 @@ Lbx1: ldr x7, [x1],#8 str x9, [x0],#8 tbnz x2, #1, Lb10 -Lb01: cbz x18, L1 +Lb01: cbz x17, L1 Lb00: ldp x6, x7, [x1] mul x8, x6, x3 @@ -105,8 +106,8 @@ Lb00: ldp x6, x7, [x1] adcs x12, x8, x11 umulh x11, x7, x3 add x0, x0, #16 - sub x18, x18, #1 - cbz x18, Lend + sub x17, x17, #1 + cbz x17, Lend .align 4 Ltop: mul x8, x4, x3 @@ -125,8 +126,8 @@ Lmid: mul x8, x6, x3 stp x12, x13, [x0],#32 adcs x12, x8, x11 umulh x11, x7, x3 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: mul x8, x4, x3 adcs x13, x9, x10 diff --git a/ext/gmp/gen/aarch64-macos/mpn/nand_n.s b/ext/gmp/gen/aarch64-macos/mpn/nand_n.s index dd75975513..083703a2d8 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/nand_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/nand_n.s @@ -74,7 +74,7 @@ .globl ___gmpn_nand_n ___gmpn_nand_n: - lsr x18, x3, #2 + lsr x17, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x7, [x1] @@ -84,7 +84,7 @@ Lbx1: ldr x7, [x1] str x15, [x0],#8 tbnz x3, #1, Lb11 -Lb01: cbz x18, Lret +Lb01: cbz x17, Lret ldp x4, x5, [x1,#8] ldp x8, x9, [x2,#8] sub x1, x1, #8 @@ -95,7 +95,7 @@ Lb11: ldp x6, x7, [x1,#8] ldp x10, x11, [x2,#8] add x1, x1, #8 add x2, x2, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -106,7 +106,7 @@ 
Lb00: ldp x4, x5, [x1],#-16 Lb10: ldp x6, x7, [x1] ldp x10, x11, [x2] - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -123,8 +123,8 @@ Lmid: ldp x6, x7, [x1,#32]! mvn x12, x12 mvn x13, x13 stp x12, x13, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: and x12, x6, x10 and x13, x7, x11 diff --git a/ext/gmp/gen/aarch64-macos/mpn/nior_n.s b/ext/gmp/gen/aarch64-macos/mpn/nior_n.s index 03a2061d69..392a012812 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/nior_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/nior_n.s @@ -74,7 +74,7 @@ .globl ___gmpn_nior_n ___gmpn_nior_n: - lsr x18, x3, #2 + lsr x17, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x7, [x1] @@ -84,7 +84,7 @@ Lbx1: ldr x7, [x1] str x15, [x0],#8 tbnz x3, #1, Lb11 -Lb01: cbz x18, Lret +Lb01: cbz x17, Lret ldp x4, x5, [x1,#8] ldp x8, x9, [x2,#8] sub x1, x1, #8 @@ -95,7 +95,7 @@ Lb11: ldp x6, x7, [x1,#8] ldp x10, x11, [x2,#8] add x1, x1, #8 add x2, x2, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -106,7 +106,7 @@ Lb00: ldp x4, x5, [x1],#-16 Lb10: ldp x6, x7, [x1] ldp x10, x11, [x2] - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -123,8 +123,8 @@ Lmid: ldp x6, x7, [x1,#32]! 
mvn x12, x12 mvn x13, x13 stp x12, x13, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: orr x12, x6, x10 orr x13, x7, x11 diff --git a/ext/gmp/gen/aarch64-macos/mpn/rsblsh1_n.s b/ext/gmp/gen/aarch64-macos/mpn/rsblsh1_n.s index 40fa58cb1b..2d5cf138c0 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/rsblsh1_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/rsblsh1_n.s @@ -86,14 +86,14 @@ .globl ___gmpn_rsblsh1_n ___gmpn_rsblsh1_n: - lsr x18, x3, #2 + lsr x6, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x5, [x1] tbnz x3, #1, Lb11 Lb01: ldr x11, [x2] - cbz x18, L1 + cbz x6, L1 ldp x8, x9, [x2,#8] lsl x13, x11, #1 subs x15, x13, x5 @@ -115,7 +115,7 @@ Lb11: ldr x9, [x2] subs x17, x13, x5 str x17, [x0],#8 sub x1, x1, #8 - cbz x18, Lend + cbz x6, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -128,7 +128,7 @@ Lb00: subs x11, xzr, xzr Lb10: subs x9, xzr, xzr ldp x10, x11, [x2] sub x1, x1, #16 - cbz x18, Lend + cbz x6, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -145,8 +145,8 @@ Lmid: ldp x4, x5, [x1,#32]! 
sbcs x16, x12, x4 sbcs x17, x13, x5 stp x16, x17, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x6, x6, #1 + cbnz x6, Ltop Lend: ldp x4, x5, [x1,#16] extr x12, x10, x9, #63 diff --git a/ext/gmp/gen/aarch64-macos/mpn/rsblsh2_n.s b/ext/gmp/gen/aarch64-macos/mpn/rsblsh2_n.s index 91d6002042..1e79fe1257 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/rsblsh2_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/rsblsh2_n.s @@ -86,14 +86,14 @@ .globl ___gmpn_rsblsh2_n ___gmpn_rsblsh2_n: - lsr x18, x3, #2 + lsr x6, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x5, [x1] tbnz x3, #1, Lb11 Lb01: ldr x11, [x2] - cbz x18, L1 + cbz x6, L1 ldp x8, x9, [x2,#8] lsl x13, x11, #2 subs x15, x13, x5 @@ -115,7 +115,7 @@ Lb11: ldr x9, [x2] subs x17, x13, x5 str x17, [x0],#8 sub x1, x1, #8 - cbz x18, Lend + cbz x6, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -128,7 +128,7 @@ Lb00: subs x11, xzr, xzr Lb10: subs x9, xzr, xzr ldp x10, x11, [x2] sub x1, x1, #16 - cbz x18, Lend + cbz x6, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -145,8 +145,8 @@ Lmid: ldp x4, x5, [x1,#32]! 
sbcs x16, x12, x4 sbcs x17, x13, x5 stp x16, x17, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x6, x6, #1 + cbnz x6, Ltop Lend: ldp x4, x5, [x1,#16] extr x12, x10, x9, #62 diff --git a/ext/gmp/gen/aarch64-macos/mpn/rsh1add_n.s b/ext/gmp/gen/aarch64-macos/mpn/rsh1add_n.s index 3920913f18..0e46013bda 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/rsh1add_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/rsh1add_n.s @@ -66,7 +66,7 @@ .globl ___gmpn_rsh1add_n ___gmpn_rsh1add_n: - lsr x18, x3, #2 + lsr x6, x3, #2 tbz x3, #0, Lbx0 @@ -76,7 +76,7 @@ Lbx1: ldr x5, [x1],#8 Lb01: adds x13, x5, x9 and x10, x13, #1 - cbz x18, L1 + cbz x6, L1 ldp x4, x5, [x1],#48 ldp x8, x9, [x2],#48 adcs x14, x4, x8 @@ -87,8 +87,8 @@ Lb01: adds x13, x5, x9 adcs x12, x4, x8 adcs x13, x5, x9 str x17, [x0], #24 - sub x18, x18, #1 - cbz x18, Lend + sub x6, x6, #1 + cbz x6, Lend b Ltop L1: cset x14, cs @@ -104,7 +104,7 @@ Lb11: adds x15, x5, x9 ldp x8, x9, [x2],#32 adcs x12, x4, x8 adcs x13, x5, x9 - cbz x18, L3 + cbz x6, L3 ldp x4, x5, [x1,#-16] ldp x8, x9, [x2,#-16] extr x17, x12, x15, #1 @@ -124,7 +124,7 @@ Lb10: ldp x4, x5, [x1],#32 adds x12, x4, x8 adcs x13, x5, x9 and x10, x12, #1 - cbz x18, L2 + cbz x6, L2 ldp x4, x5, [x1,#-16] ldp x8, x9, [x2,#-16] adcs x14, x4, x8 @@ -141,8 +141,8 @@ Lb00: ldp x4, x5, [x1],#48 adcs x12, x4, x8 adcs x13, x5, x9 add x0, x0, #16 - sub x18, x18, #1 - cbz x18, Lend + sub x6, x6, #1 + cbz x6, Lend .align 4 Ltop: ldp x4, x5, [x1,#-16] @@ -159,8 +159,8 @@ Lmid: ldp x4, x5, [x1],#32 adcs x12, x4, x8 adcs x13, x5, x9 stp x16, x17, [x0],#32 - sub x18, x18, #1 - cbnz x18, Ltop + sub x6, x6, #1 + cbnz x6, Ltop Lend: extr x16, x15, x14, #1 extr x17, x12, x15, #1 diff --git a/ext/gmp/gen/aarch64-macos/mpn/rsh1sub_n.s b/ext/gmp/gen/aarch64-macos/mpn/rsh1sub_n.s index 745db9ec02..2600b60f32 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/rsh1sub_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/rsh1sub_n.s @@ -66,7 +66,7 @@ .globl ___gmpn_rsh1sub_n ___gmpn_rsh1sub_n: - lsr x18, x3, #2 + lsr x6, 
x3, #2 tbz x3, #0, Lbx0 @@ -76,7 +76,7 @@ Lbx1: ldr x5, [x1],#8 Lb01: subs x13, x5, x9 and x10, x13, #1 - cbz x18, L1 + cbz x6, L1 ldp x4, x5, [x1],#48 ldp x8, x9, [x2],#48 sbcs x14, x4, x8 @@ -87,8 +87,8 @@ Lb01: subs x13, x5, x9 sbcs x12, x4, x8 sbcs x13, x5, x9 str x17, [x0], #24 - sub x18, x18, #1 - cbz x18, Lend + sub x6, x6, #1 + cbz x6, Lend b Ltop L1: cset x14, cc @@ -104,7 +104,7 @@ Lb11: subs x15, x5, x9 ldp x8, x9, [x2],#32 sbcs x12, x4, x8 sbcs x13, x5, x9 - cbz x18, L3 + cbz x6, L3 ldp x4, x5, [x1,#-16] ldp x8, x9, [x2,#-16] extr x17, x12, x15, #1 @@ -124,7 +124,7 @@ Lb10: ldp x4, x5, [x1],#32 subs x12, x4, x8 sbcs x13, x5, x9 and x10, x12, #1 - cbz x18, L2 + cbz x6, L2 ldp x4, x5, [x1,#-16] ldp x8, x9, [x2,#-16] sbcs x14, x4, x8 @@ -141,8 +141,8 @@ Lb00: ldp x4, x5, [x1],#48 sbcs x12, x4, x8 sbcs x13, x5, x9 add x0, x0, #16 - sub x18, x18, #1 - cbz x18, Lend + sub x6, x6, #1 + cbz x6, Lend .align 4 Ltop: ldp x4, x5, [x1,#-16] @@ -159,8 +159,8 @@ Lmid: ldp x4, x5, [x1],#32 sbcs x12, x4, x8 sbcs x13, x5, x9 stp x16, x17, [x0],#32 - sub x18, x18, #1 - cbnz x18, Ltop + sub x6, x6, #1 + cbnz x6, Ltop Lend: extr x16, x15, x14, #1 extr x17, x12, x15, #1 diff --git a/ext/gmp/gen/aarch64-macos/mpn/rshift.s b/ext/gmp/gen/aarch64-macos/mpn/rshift.s index 472e5bc426..a27751987b 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/rshift.s +++ b/ext/gmp/gen/aarch64-macos/mpn/rshift.s @@ -72,7 +72,7 @@ ___gmpn_rshift: mov x16, x0 sub x8, xzr, x3 - lsr x18, x2, #2 + lsr x17, x2, #2 tbz x2, #0, Lbx0 Lbx1: ldr x5, [x1] @@ -80,7 +80,7 @@ Lbx1: ldr x5, [x1] Lb01: lsl x0, x5, x8 lsr x2, x5, x3 - cbnz x18, Lgt1 + cbnz x17, Lgt1 str x2, [x16] ret Lgt1: ldp x4, x5, [x1,#8] @@ -101,7 +101,7 @@ Lb10: lsl x0, x4, x8 lsr x13, x4, x3 lsl x10, x5, x8 lsr x2, x5, x3 - cbnz x18, Lgt2 + cbnz x17, Lgt2 orr x10, x10, x13 stp x10, x2, [x16] ret @@ -133,11 +133,11 @@ Llo2: lsl x10, x5, x8 orr x11, x12, x2 stp x11, x10, [x16,#32]! 
lsr x2, x5, x3 -Llo0: sub x18, x18, #1 +Llo0: sub x17, x17, #1 Llo3: lsl x10, x7, x8 lsl x12, x6, x8 lsr x13, x6, x3 - cbnz x18, Ltop + cbnz x17, Ltop Lend: orr x10, x10, x13 orr x11, x12, x2 diff --git a/ext/gmp/gen/aarch64-macos/mpn/sqr_diag_addlsh1.s b/ext/gmp/gen/aarch64-macos/mpn/sqr_diag_addlsh1.s index 0e01e2858a..0255158b72 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/sqr_diag_addlsh1.s +++ b/ext/gmp/gen/aarch64-macos/mpn/sqr_diag_addlsh1.s @@ -58,7 +58,7 @@ ___gmpn_sqr_diag_addlsh1: ldr x15, [x2],#8 - lsr x18, x3, #1 + lsr x14, x3, #1 tbz x3, #0, Lbx0 Lbx1: adds x7, xzr, xzr @@ -73,8 +73,8 @@ Lbx0: adds x5, xzr, xzr ldr x17, [x2],#16 ldp x6, x7, [x1],#32 umulh x11, x15, x15 - sub x18, x18, #1 - cbz x18, Lend + sub x14, x14, #1 + cbz x14, Lend .align 4 Ltop: extr x9, x6, x5, #63 @@ -95,8 +95,8 @@ Lmid: extr x9, x4, x7, #63 extr x8, x5, x4, #63 stp x12, x13, [x0],#16 adcs x12, x8, x10 - sub x18, x18, #1 - cbnz x18, Ltop + sub x14, x14, #1 + cbnz x14, Ltop Lend: extr x9, x6, x5, #63 mul x10, x17, x17 diff --git a/ext/gmp/gen/aarch64-macos/mpn/sub_n.s b/ext/gmp/gen/aarch64-macos/mpn/sub_n.s index 0ed940928d..3695521862 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/sub_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/sub_n.s @@ -77,7 +77,7 @@ ___gmpn_sub_nc: ___gmpn_sub_n: cmp xzr, xzr -Lent: lsr x18, x3, #2 +Lent: lsr x17, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x7, [x1] @@ -86,7 +86,7 @@ Lbx1: ldr x7, [x1] str x13, [x0],#8 tbnz x3, #1, Lb11 -Lb01: cbz x18, Lret +Lb01: cbz x17, Lret ldp x4, x5, [x1,#8] ldp x8, x9, [x2,#8] sub x1, x1, #8 @@ -97,7 +97,7 @@ Lb11: ldp x6, x7, [x1,#8] ldp x10, x11, [x2,#8] add x1, x1, #8 add x2, x2, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -110,7 +110,7 @@ Lb00: ldp x4, x5, [x1] Lb10: ldp x6, x7, [x1] ldp x10, x11, [x2] - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -123,8 +123,8 @@ Lmid: ldp x6, x7, [x1,#32]! 
sbcs x12, x4, x8 sbcs x13, x5, x9 stp x12, x13, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: sbcs x12, x6, x10 sbcs x13, x7, x11 diff --git a/ext/gmp/gen/aarch64-macos/mpn/sublsh1_n.s b/ext/gmp/gen/aarch64-macos/mpn/sublsh1_n.s index 7bc7204291..e3e924f379 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/sublsh1_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/sublsh1_n.s @@ -86,14 +86,14 @@ .globl ___gmpn_sublsh1_n ___gmpn_sublsh1_n: - lsr x18, x3, #2 + lsr x6, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x5, [x1] tbnz x3, #1, Lb11 Lb01: ldr x11, [x2] - cbz x18, L1 + cbz x6, L1 ldp x8, x9, [x2,#8] lsl x13, x11, #1 subs x15, x5, x13 @@ -115,7 +115,7 @@ Lb11: ldr x9, [x2] subs x17, x5, x13 str x17, [x0],#8 sub x1, x1, #8 - cbz x18, Lend + cbz x6, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -128,7 +128,7 @@ Lb00: subs x11, xzr, xzr Lb10: subs x9, xzr, xzr ldp x10, x11, [x2] sub x1, x1, #16 - cbz x18, Lend + cbz x6, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -145,8 +145,8 @@ Lmid: ldp x4, x5, [x1,#32]! 
sbcs x16, x4, x12 sbcs x17, x5, x13 stp x16, x17, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x6, x6, #1 + cbnz x6, Ltop Lend: ldp x4, x5, [x1,#16] extr x12, x10, x9, #63 diff --git a/ext/gmp/gen/aarch64-macos/mpn/sublsh2_n.s b/ext/gmp/gen/aarch64-macos/mpn/sublsh2_n.s index 3b37de4c79..0df8084a3b 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/sublsh2_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/sublsh2_n.s @@ -86,14 +86,14 @@ .globl ___gmpn_sublsh2_n ___gmpn_sublsh2_n: - lsr x18, x3, #2 + lsr x6, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x5, [x1] tbnz x3, #1, Lb11 Lb01: ldr x11, [x2] - cbz x18, L1 + cbz x6, L1 ldp x8, x9, [x2,#8] lsl x13, x11, #2 subs x15, x5, x13 @@ -115,7 +115,7 @@ Lb11: ldr x9, [x2] subs x17, x5, x13 str x17, [x0],#8 sub x1, x1, #8 - cbz x18, Lend + cbz x6, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -128,7 +128,7 @@ Lb00: subs x11, xzr, xzr Lb10: subs x9, xzr, xzr ldp x10, x11, [x2] sub x1, x1, #16 - cbz x18, Lend + cbz x6, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -145,8 +145,8 @@ Lmid: ldp x4, x5, [x1,#32]! 
sbcs x16, x4, x12 sbcs x17, x5, x13 stp x16, x17, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x6, x6, #1 + cbnz x6, Ltop Lend: ldp x4, x5, [x1,#16] extr x12, x10, x9, #62 diff --git a/ext/gmp/gen/aarch64-macos/mpn/submul_1.s b/ext/gmp/gen/aarch64-macos/mpn/submul_1.s index 439b82096b..11f80f4673 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/submul_1.s +++ b/ext/gmp/gen/aarch64-macos/mpn/submul_1.s @@ -54,6 +54,11 @@ + + + + + diff --git a/ext/gmp/gen/aarch64-macos/mpn/xnor_n.s b/ext/gmp/gen/aarch64-macos/mpn/xnor_n.s index 5ddbb4a9d2..345378e20c 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/xnor_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/xnor_n.s @@ -73,7 +73,7 @@ .globl ___gmpn_xnor_n ___gmpn_xnor_n: - lsr x18, x3, #2 + lsr x17, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x7, [x1] @@ -82,7 +82,7 @@ Lbx1: ldr x7, [x1] str x15, [x0],#8 tbnz x3, #1, Lb11 -Lb01: cbz x18, Lret +Lb01: cbz x17, Lret ldp x4, x5, [x1,#8] ldp x8, x9, [x2,#8] sub x1, x1, #8 @@ -93,7 +93,7 @@ Lb11: ldp x6, x7, [x1,#8] ldp x10, x11, [x2,#8] add x1, x1, #8 add x2, x2, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -104,7 +104,7 @@ Lb00: ldp x4, x5, [x1],#-16 Lb10: ldp x6, x7, [x1] ldp x10, x11, [x2] - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -117,8 +117,8 @@ Lmid: ldp x6, x7, [x1,#32]! 
eon x12, x4, x8 eon x13, x5, x9 stp x12, x13, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: eon x12, x6, x10 eon x13, x7, x11 diff --git a/ext/gmp/gen/aarch64-macos/mpn/xor_n.s b/ext/gmp/gen/aarch64-macos/mpn/xor_n.s index d696c28867..2b59a4b079 100644 --- a/ext/gmp/gen/aarch64-macos/mpn/xor_n.s +++ b/ext/gmp/gen/aarch64-macos/mpn/xor_n.s @@ -73,7 +73,7 @@ .globl ___gmpn_xor_n ___gmpn_xor_n: - lsr x18, x3, #2 + lsr x17, x3, #2 tbz x3, #0, Lbx0 Lbx1: ldr x7, [x1] @@ -82,7 +82,7 @@ Lbx1: ldr x7, [x1] str x15, [x0],#8 tbnz x3, #1, Lb11 -Lb01: cbz x18, Lret +Lb01: cbz x17, Lret ldp x4, x5, [x1,#8] ldp x8, x9, [x2,#8] sub x1, x1, #8 @@ -93,7 +93,7 @@ Lb11: ldp x6, x7, [x1,#8] ldp x10, x11, [x2,#8] add x1, x1, #8 add x2, x2, #8 - cbz x18, Lend + cbz x17, Lend b Ltop Lbx0: tbnz x3, #1, Lb10 @@ -104,7 +104,7 @@ Lb00: ldp x4, x5, [x1],#-16 Lb10: ldp x6, x7, [x1] ldp x10, x11, [x2] - cbz x18, Lend + cbz x17, Lend .align 4 Ltop: ldp x4, x5, [x1,#16] @@ -117,8 +117,8 @@ Lmid: ldp x6, x7, [x1,#32]! eor x12, x4, x8 eor x13, x5, x9 stp x12, x13, [x0],#16 - sub x18, x18, #1 - cbnz x18, Ltop + sub x17, x17, #1 + cbnz x17, Ltop Lend: eor x12, x6, x10 eor x13, x7, x11 diff --git a/ext/gmp/gen/x86_64-linux/sieve_table.h b/ext/gmp/gen/x86_64-linux/sieve_table.h new file mode 100644 index 0000000000..ee9ac14360 --- /dev/null +++ b/ext/gmp/gen/x86_64-linux/sieve_table.h @@ -0,0 +1,46 @@ +/* This file generated by gen-sieve.c - DO NOT EDIT. 
*/ + +#if GMP_LIMB_BITS != 64 +Error, error, this data is for 64 bits +#endif + +#define PRIMESIEVE_INIT_TABLE \ + CNST_LIMB (0x3294C9E069128480), /* 5 - 196 (42 primes) */ \ + CNST_LIMB (0x95A35E1EC4AB21DC), /* 197 - 388 (32 primes) */ \ + CNST_LIMB (0x4AD7CE99B8693366), /* 389 - 580 (30 primes) */ \ + CNST_LIMB (0x6595B6DA728DC52B), /* 581 - 772 (30 primes) */ \ + CNST_LIMB (0xEA6D9F8787B0CEDE), /* 773 - 964 (26 primes) */ \ + CNST_LIMB (0x3F56A1F4CD3275A9), /* 965 - 1156 (29 primes) */ \ + CNST_LIMB (0xFD3848FB74A76ADB), /* 1157 - 1348 (26 primes) */ \ + CNST_LIMB (0xDBBA0DD1A1EDF6AF), /* 1349 - 1540 (25 primes) */ \ + CNST_LIMB (0xCEC7F17ED22799A5), /* 1541 - 1732 (27 primes) */ \ + CNST_LIMB (0xEAEC17BDBB717D56), /* 1733 - 1924 (24 primes) */ \ + CNST_LIMB (0x3B0EB7B3585AFCF3), /* 1925 - 2116 (26 primes) */ \ + CNST_LIMB (0xE563D8F69FDF6C4F), /* 2117 - 2308 (23 primes) */ \ + CNST_LIMB (0xFE5BA7ABA45E92FC), /* 2309 - 2500 (25 primes) */ \ + CNST_LIMB (0x158DEE6F3BF49B7D), /* 2501 - 2692 (24 primes) */ \ + CNST_LIMB (0xBE5A7BC4EDE6CD1A), /* 2693 - 2884 (26 primes) */ \ + CNST_LIMB (0xD7679B3FCA7BB6AD), /* 2885 - 3076 (22 primes) */ \ + CNST_LIMB (0xC3F66B971FEF37E9), /* 3077 - 3268 (22 primes) */ \ + CNST_LIMB (0x6F7EBCF339C953FD), /* 3269 - 3460 (22 primes) */ \ + CNST_LIMB (0xD5A5ECDCD235DBF0), /* 3461 - 3652 (27 primes) */ \ + CNST_LIMB (0xECFA7B2FD5B65E3B), /* 3653 - 3844 (22 primes) */ \ + CNST_LIMB (0xD28EFDF9C89F67B1), /* 3845 - 4036 (25 primes) */ \ + CNST_LIMB (0xCB7F7C7A3DD3AF4F), /* 4037 - 4228 (21 primes) */ \ + CNST_LIMB (0xEEBED6CDFF6B32CC), /* 4229 - 4420 (22 primes) */ \ + CNST_LIMB (0xD5BD73F85ECFA97C), /* 4421 - 4612 (23 primes) */ \ + CNST_LIMB (0x21FDBE4FBBAD48F7), /* 4613 - 4804 (24 primes) */ \ + CNST_LIMB (0x5E35A3B5EEB7FDE7), /* 4805 - 4996 (21 primes) */ \ + CNST_LIMB (0xD9EBFD53A7DBBCC9), /* 4997 - 5188 (22 primes) */ \ + CNST_LIMB (0xFF9EDEAF2EFE1F76), /* 5189 - 5380 (18 primes) */ +#define PRIMESIEVE_NUMBEROF_TABLE 28 +/* #define 
PRIMESIEVE_PRIMES_IN_TABLE 706 */ +#define PRIMESIEVE_HIGHEST_PRIME 5351 +/* #define PRIMESIEVE_FIRST_UNCHECKED 5381 */ + +#define SIEVE_MASK1 CNST_LIMB(0x3204C1A049120485) +#define SIEVE_MASKT CNST_LIMB(0xA1204892058) +#define SIEVE_2MSK1 CNST_LIMB(0x29048402110840A) +#define SIEVE_2MSK2 CNST_LIMB(0x9402180C40230184) +#define SIEVE_2MSKT CNST_LIMB(0x5021088402120) + From c75bd00c40246ba69425c60eb92cfa52f0e64950 Mon Sep 17 00:00:00 2001 From: nathanlever Date: Thu, 17 Oct 2024 16:43:43 +0300 Subject: [PATCH 93/97] build: upgrade gmp to 6.3.0 on x86_64-macos --- ext/gmp/gen/x86_64-macos/config.h | 18 +- ext/gmp/gen/x86_64-macos/mpn/addlsh_n.s | 1 + ext/gmp/gen/x86_64-macos/mpn/addmul_1.s | 2 + ext/gmp/gen/x86_64-macos/mpn/mul_basecase.s | 3 +- ext/gmp/gen/x86_64-macos/mpn/rsblsh_n.s | 1 + ext/gmp/gen/x86_64-macos/mpn/sbpi1_bdiv_r.s | 681 ++++++++++++++++++++ ext/gmp/gen/x86_64-macos/sieve_table.h | 46 ++ 7 files changed, 743 insertions(+), 9 deletions(-) create mode 100644 ext/gmp/gen/x86_64-macos/mpn/sbpi1_bdiv_r.s create mode 100644 ext/gmp/gen/x86_64-macos/sieve_table.h diff --git a/ext/gmp/gen/x86_64-macos/config.h b/ext/gmp/gen/x86_64-macos/config.h index 1fbed06fdb..72cebebce1 100644 --- a/ext/gmp/gen/x86_64-macos/config.h +++ b/ext/gmp/gen/x86_64-macos/config.h @@ -3,7 +3,7 @@ /* -Copyright 1996-2020 Free Software Foundation, Inc. +Copyright 1996-2022 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -187,6 +187,7 @@ see https://www.gnu.org/licenses/. /* #undef HAVE_HOST_CPU_skylake */ /* #undef HAVE_HOST_CPU_silvermont */ /* #undef HAVE_HOST_CPU_goldmont */ +/* #undef HAVE_HOST_CPU_tremont */ /* #undef HAVE_HOST_CPU_k8 */ /* #undef HAVE_HOST_CPU_k10 */ /* #undef HAVE_HOST_CPU_bulldozer */ @@ -201,6 +202,9 @@ see https://www.gnu.org/licenses/. 
/* #undef HAVE_HOST_CPU_s390_z9 */ /* #undef HAVE_HOST_CPU_s390_z10 */ /* #undef HAVE_HOST_CPU_s390_z196 */ +/* #undef HAVE_HOST_CPU_s390_z13 */ +/* #undef HAVE_HOST_CPU_s390_z14 */ +/* #undef HAVE_HOST_CPU_s390_z15 */ /* Define to 1 iff we have a s390 with 64-bit registers. */ /* #undef HAVE_HOST_CPU_s390_zarch */ @@ -258,7 +262,7 @@ see https://www.gnu.org/licenses/. #define HAVE_NATIVE_mpn_add_n 1 /* #undef HAVE_NATIVE_mpn_add_n_sub_n */ #define HAVE_NATIVE_mpn_add_nc 1 -#define HAVE_NATIVE_mpn_addaddmul_1msb0 1 +/* #undef HAVE_NATIVE_mpn_addaddmul_1msb0 */ #define HAVE_NATIVE_mpn_addlsh1_n 1 #define HAVE_NATIVE_mpn_addlsh2_n 1 #define HAVE_NATIVE_mpn_addlsh_n 1 @@ -348,7 +352,7 @@ see https://www.gnu.org/licenses/. #define HAVE_NATIVE_mpn_rsh1sub_n 1 #define HAVE_NATIVE_mpn_rsh1sub_nc 1 #define HAVE_NATIVE_mpn_rshift 1 -/* #undef HAVE_NATIVE_mpn_sbpi1_bdiv_r */ +#define HAVE_NATIVE_mpn_sbpi1_bdiv_r 1 #define HAVE_NATIVE_mpn_sqr_basecase 1 /* #undef HAVE_NATIVE_mpn_sqr_diagonal */ #define HAVE_NATIVE_mpn_sqr_diag_addlsh1 1 @@ -542,13 +546,13 @@ see https://www.gnu.org/licenses/. #define PACKAGE "gmp" /* Define to the address where bug reports for this package should be sent. */ -#define PACKAGE_BUGREPORT "gmp-bugs@gmplib.org, see https://gmplib.org/manual/Reporting-Bugs.html" +#define PACKAGE_BUGREPORT "gmp-bugs@gmplib.org (see https://gmplib.org/manual/Reporting-Bugs.html)" /* Define to the full name of this package. */ #define PACKAGE_NAME "GNU MP" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "GNU MP 6.2.1" +#define PACKAGE_STRING "GNU MP 6.3.0" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "gmp" @@ -557,7 +561,7 @@ see https://www.gnu.org/licenses/. #define PACKAGE_URL "http://www.gnu.org/software/gmp/" /* Define to the version of this package. */ -#define PACKAGE_VERSION "6.2.1" +#define PACKAGE_VERSION "6.3.0" /* Define as the return type of signal handlers (`int' or `void'). 
*/ #define RETSIGTYPE void @@ -590,7 +594,7 @@ see https://www.gnu.org/licenses/. /* #undef TUNE_SQR_TOOM2_MAX */ /* Version number of package */ -#define VERSION "6.2.1" +#define VERSION "6.3.0" /* Define to 1 to enable ASSERT checking, per --enable-assert */ /* #undef WANT_ASSERT */ diff --git a/ext/gmp/gen/x86_64-macos/mpn/addlsh_n.s b/ext/gmp/gen/x86_64-macos/mpn/addlsh_n.s index 09035432cf..f71088eb48 100644 --- a/ext/gmp/gen/x86_64-macos/mpn/addlsh_n.s +++ b/ext/gmp/gen/x86_64-macos/mpn/addlsh_n.s @@ -89,6 +89,7 @@ + diff --git a/ext/gmp/gen/x86_64-macos/mpn/addmul_1.s b/ext/gmp/gen/x86_64-macos/mpn/addmul_1.s index b884829fe2..0435e8ae76 100644 --- a/ext/gmp/gen/x86_64-macos/mpn/addmul_1.s +++ b/ext/gmp/gen/x86_64-macos/mpn/addmul_1.s @@ -69,6 +69,8 @@ + + diff --git a/ext/gmp/gen/x86_64-macos/mpn/mul_basecase.s b/ext/gmp/gen/x86_64-macos/mpn/mul_basecase.s index 0e74e66cde..9d24fda96c 100644 --- a/ext/gmp/gen/x86_64-macos/mpn/mul_basecase.s +++ b/ext/gmp/gen/x86_64-macos/mpn/mul_basecase.s @@ -311,8 +311,7 @@ Ldone: ret -Lf2: - .byte 0xc4,98,171,0xf6,94,248 +Lf2: .byte 0xc4,98,171,0xf6,94,248 lea 8(%rdi,%rbx,8), %rdi .byte 0xc4,98,155,0xf6,14 diff --git a/ext/gmp/gen/x86_64-macos/mpn/rsblsh_n.s b/ext/gmp/gen/x86_64-macos/mpn/rsblsh_n.s index 0117c0d9c2..329c600b48 100644 --- a/ext/gmp/gen/x86_64-macos/mpn/rsblsh_n.s +++ b/ext/gmp/gen/x86_64-macos/mpn/rsblsh_n.s @@ -90,6 +90,7 @@ + diff --git a/ext/gmp/gen/x86_64-macos/mpn/sbpi1_bdiv_r.s b/ext/gmp/gen/x86_64-macos/mpn/sbpi1_bdiv_r.s new file mode 100644 index 0000000000..dcf3376688 --- /dev/null +++ b/ext/gmp/gen/x86_64-macos/mpn/sbpi1_bdiv_r.s @@ -0,0 +1,681 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_sbpi1_bdiv_r + + +___gmpn_sbpi1_bdiv_r: + + + + + lea Latab(%rip), %r10 + + cmp $8, %rcx + jbe Lsma + 
+ + +Lgen: push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + mov %rdx, %r14 + xor %r13, %r13 + + sub %rcx, %rsi + + lea -8(,%rcx,8), %rbx + neg %rbx + mov %rcx, %rbp + mov %ecx, %eax + shr $3, %rbp + and $7, %eax + + movslq (%r10,%rax,4), %rax + lea (%rax,%r10), %rax + + mov (%rdi), %rdx + imul %r8, %rdx + jmp Louter + +Lf0: .byte 0xc4,66,171,0xf6,30 + lea -1(%rcx), %rcx + .byte 0xc4,66,155,0xf6,78,8 + lea -8(%r14), %r14 + .byte 0x66,77,0x0f,0x38,0xf6,227 + .byte 0xf3,76,0x0f,0x38,0xf6,23 + lea -8(%rdi), %rdi + jmp Lb0x + +Lf3: .byte 0xc4,66,155,0xf6,14 + .byte 0xc4,66,171,0xf6,94,8 + .byte 0xf3,76,0x0f,0x38,0xf6,39 + lea -48(%rdi), %rdi + lea 16(%r14), %r14 + jmp Lb3x + +Lf4: .byte 0xc4,66,171,0xf6,30 + .byte 0xc4,66,155,0xf6,78,8 + lea 24(%r14), %r14 + .byte 0xf3,76,0x0f,0x38,0xf6,23 + lea -40(%rdi), %rdi + .byte 0x66,77,0x0f,0x38,0xf6,227 + jmp Lb4x + +Lf5: .byte 0xc4,66,155,0xf6,14 + .byte 0xc4,66,171,0xf6,94,8 + lea 32(%r14), %r14 + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xf3,76,0x0f,0x38,0xf6,39 + lea -32(%rdi), %rdi + jmp Lb5x + +Lf6: .byte 0xc4,66,171,0xf6,30 + .byte 0xc4,66,155,0xf6,78,8 + lea 40(%r14), %r14 + .byte 0xf3,76,0x0f,0x38,0xf6,23 + lea -24(%rdi), %rdi + .byte 0x66,77,0x0f,0x38,0xf6,227 + jmp Lb6x + +Lf7: .byte 0xc4,66,155,0xf6,14 + .byte 0xc4,66,171,0xf6,94,8 + lea 48(%r14), %r14 + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xf3,76,0x0f,0x38,0xf6,39 + lea -16(%rdi), %rdi + jmp Lb7x + +Lf1: .byte 0xc4,66,155,0xf6,14 + .byte 0xc4,66,171,0xf6,94,8 + .byte 0xf3,76,0x0f,0x38,0xf6,39 + lea -1(%rcx), %rcx + jmp Lb1x + +Lf2: .byte 0xc4,66,171,0xf6,30 + .byte 0xc4,66,155,0xf6,78,8 + lea 8(%r14), %r14 + .byte 0xf3,76,0x0f,0x38,0xf6,23 + lea 8(%rdi), %rdi + .byte 0x66,77,0x0f,0x38,0xf6,227 + jmp Lb2x + +Lend: .byte 0xf3,76,0x0f,0x38,0xf6,39 + .byte 0xf3,76,0x0f,0x38,0xf6,201 + mov %r12, (%rdi) + adc %rcx, %r9 + mov 8(%rdi,%rbx), %rdx + .byte 0xc4,66,235,0xf6,224 + bt $0, %r13d + adc %r9, 8(%rdi) + setc %r13b + dec %rsi + jz Ldone + + lea 
(%r14,%rbx), %r14 + lea 8(%rdi,%rbx), %rdi +Louter: + mov %rbp, %rcx + test %eax, %eax + jmp *%rax + + .align 4, 0x90 +Ltop: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 + .byte 0x66,77,0x0f,0x38,0xf6,227 + mov %r10, -8(%rdi) + jrcxz Lend +Lb2x: .byte 0xc4,66,171,0xf6,94,8 + .byte 0xf3,76,0x0f,0x38,0xf6,39 + lea -1(%rcx), %rcx + mov %r12, (%rdi) +Lb1x: .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xc4,66,155,0xf6,78,16 + .byte 0x66,77,0x0f,0x38,0xf6,227 + .byte 0xf3,76,0x0f,0x38,0xf6,87,8 + mov %r10, 8(%rdi) +Lb0x: .byte 0xc4,66,171,0xf6,94,24 + lea 64(%r14), %r14 + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xf3,76,0x0f,0x38,0xf6,103,16 + mov %r12, 16(%rdi) +Lb7x: .byte 0xc4,66,155,0xf6,78,224 + .byte 0xf3,76,0x0f,0x38,0xf6,87,24 + .byte 0x66,77,0x0f,0x38,0xf6,227 + mov %r10, 24(%rdi) +Lb6x: .byte 0xc4,66,171,0xf6,94,232 + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xf3,76,0x0f,0x38,0xf6,103,32 + mov %r12, 32(%rdi) +Lb5x: .byte 0xc4,66,155,0xf6,78,240 + .byte 0xf3,76,0x0f,0x38,0xf6,87,40 + .byte 0x66,77,0x0f,0x38,0xf6,227 + mov %r10, 40(%rdi) +Lb4x: .byte 0xf3,76,0x0f,0x38,0xf6,103,48 + .byte 0xc4,66,171,0xf6,94,248 + mov %r12, 48(%rdi) +Lb3x: lea 64(%rdi), %rdi + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xc4,66,155,0xf6,14 + jmp Ltop + +Ldone:mov %r13, %rax + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + +Lsma: + movslq 28(%r10,%rcx,4), %rax + lea (%rax,%r10), %rax + + jmp *%rax + +L1: mov (%rdx), %r10 + xor %eax, %eax + mov (%rdi), %rdx + dec %rsi + mov %rdx, %r9 +Lo1: .byte 0xc4,66,235,0xf6,216 + lea 8(%rdi), %rdi + .byte 0xc4,194,243,0xf6,210 + add %r9, %rcx + adc %rax, %rdx + add (%rdi), %rdx + setc %al + mov %rdx, %r9 + dec %rsi + jnz Lo1 + mov %r9, (%rdi) + + + ret + + +L2: push %r12 + push %r14 + + mov %rdx, %r14 + sub %rcx, %rsi + mov (%rdi), %rdx + imul %r8, %rdx + + + + push %rbx + push %r13 + xor %r13d, %r13d + mov (%rdi), %rax + mov 8(%rdi), %rbx +Lo2: xor %ecx, %ecx + .byte 0xc4,66,171,0xf6,30 + .byte 0xc4,66,235,0xf6,78,8 + .byte 
0xf3,76,0x0f,0x38,0xf6,208 + .byte 0x66,73,0x0f,0x38,0xf6,211 + .byte 0xf3,72,0x0f,0x38,0xf6,211 + .byte 0xf3,76,0x0f,0x38,0xf6,201 + mov %rdx, %rax + adc %rcx, %r9 + imul %r8, %rdx + bt $0, %r13d + adc 16(%rdi), %r9 + mov %r9, %rbx + setc %r13b + lea 8(%rdi), %rdi + dec %rsi + jnz Lo2 + + mov %rax, (%rdi) + mov %rbx, 8(%rdi) + mov %r13, %rax + pop %r13 + pop %rbx + + + + pop %r14 + pop %r12 + + ret + + +L3: push %rbx + push %r12 + push %r13 + push %r14 + + mov %rdx, %r14 + xor %r13, %r13 + sub %rcx, %rsi + mov (%rdi), %rax + mov 8(%rdi), %rbx + mov %rax, %rdx + imul %r8, %rdx +Lo3: xor %ecx, %ecx + .byte 0xc4,66,155,0xf6,14 + .byte 0xf3,76,0x0f,0x38,0xf6,224 + .byte 0xc4,66,251,0xf6,94,8 + .byte 0x66,73,0x0f,0x38,0xf6,193 + .byte 0xf3,72,0x0f,0x38,0xf6,195 + .byte 0xc4,66,227,0xf6,78,16 + mov %r8, %rdx + .byte 0xc4,98,235,0xf6,224 + .byte 0x66,73,0x0f,0x38,0xf6,219 + .byte 0xf3,72,0x0f,0x38,0xf6,95,16 + .byte 0xf3,76,0x0f,0x38,0xf6,201 + adc $0, %r9 + bt $0, %r13d + adc %r9, 24(%rdi) + setc %r13b + lea 8(%rdi), %rdi + dec %rsi + jnz Lo3 + jmp Lesma + + + +L4: push %rbx + push %r12 + push %r13 + push %r14 + + mov %rdx, %r14 + xor %r13, %r13 + sub %rcx, %rsi + mov (%rdi), %rax + mov 8(%rdi), %rbx + mov %rax, %rdx + imul %r8, %rdx +Lo4: xor %ecx, %ecx + .byte 0xc4,66,171,0xf6,30 + .byte 0xf3,76,0x0f,0x38,0xf6,208 + .byte 0xc4,66,251,0xf6,78,8 + .byte 0x66,73,0x0f,0x38,0xf6,195 + .byte 0xf3,72,0x0f,0x38,0xf6,195 + .byte 0xc4,66,227,0xf6,94,16 + .byte 0x66,73,0x0f,0x38,0xf6,217 + .byte 0xc4,66,155,0xf6,78,24 + mov %r8, %rdx + .byte 0xc4,98,235,0xf6,208 + .byte 0xf3,72,0x0f,0x38,0xf6,95,16 + .byte 0x66,77,0x0f,0x38,0xf6,227 + .byte 0xf3,76,0x0f,0x38,0xf6,103,24 + .byte 0xf3,76,0x0f,0x38,0xf6,201 + mov %r12, 24(%rdi) + adc %rcx, %r9 + bt $0, %r13d + adc %r9, 32(%rdi) + setc %r13b + lea 8(%rdi), %rdi + dec %rsi + jnz Lo4 + jmp Lesma + + + +L5: push %rbx + push %r12 + push %r13 + push %r14 + + mov %rdx, %r14 + xor %r13, %r13 + sub %rcx, %rsi + mov (%rdi), %rax + mov 
8(%rdi), %rbx + mov %rax, %rdx + imul %r8, %rdx +Lo5: xor %ecx, %ecx + .byte 0xc4,66,155,0xf6,14 + .byte 0xf3,76,0x0f,0x38,0xf6,224 + .byte 0xc4,66,251,0xf6,94,8 + .byte 0x66,73,0x0f,0x38,0xf6,193 + .byte 0xf3,72,0x0f,0x38,0xf6,195 + .byte 0xc4,66,227,0xf6,78,16 + .byte 0x66,73,0x0f,0x38,0xf6,219 + .byte 0xf3,72,0x0f,0x38,0xf6,95,16 + .byte 0xc4,66,171,0xf6,94,24 + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xc4,66,155,0xf6,78,32 + .byte 0xf3,76,0x0f,0x38,0xf6,87,24 + .byte 0x66,77,0x0f,0x38,0xf6,227 + mov %r8, %rdx + .byte 0xc4,98,235,0xf6,216 + mov %r10, 24(%rdi) + .byte 0xf3,76,0x0f,0x38,0xf6,103,32 + .byte 0xf3,76,0x0f,0x38,0xf6,201 + mov %r12, 32(%rdi) + adc %rcx, %r9 + bt $0, %r13d + adc %r9, 40(%rdi) + setc %r13b + lea 8(%rdi), %rdi + dec %rsi + jnz Lo5 + jmp Lesma + + + +L6: push %rbx + push %r12 + push %r13 + push %r14 + + mov %rdx, %r14 + xor %r13, %r13 + sub %rcx, %rsi + mov (%rdi), %rax + mov 8(%rdi), %rbx + mov %rax, %rdx + imul %r8, %rdx +Lo6: xor %ecx, %ecx + .byte 0xc4,66,171,0xf6,30 + .byte 0xf3,76,0x0f,0x38,0xf6,208 + .byte 0xc4,66,251,0xf6,78,8 + .byte 0x66,73,0x0f,0x38,0xf6,195 + .byte 0xf3,72,0x0f,0x38,0xf6,195 + .byte 0xc4,66,227,0xf6,94,16 + .byte 0x66,73,0x0f,0x38,0xf6,217 + .byte 0xc4,66,155,0xf6,78,24 + .byte 0xf3,72,0x0f,0x38,0xf6,95,16 + .byte 0x66,77,0x0f,0x38,0xf6,227 + .byte 0xf3,76,0x0f,0x38,0xf6,103,24 + .byte 0xc4,66,171,0xf6,94,32 + mov %r12, 24(%rdi) + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xc4,66,155,0xf6,78,40 + .byte 0xf3,76,0x0f,0x38,0xf6,87,32 + .byte 0x66,77,0x0f,0x38,0xf6,227 + mov %r8, %rdx + .byte 0xc4,98,235,0xf6,216 + mov %r10, 32(%rdi) + .byte 0xf3,76,0x0f,0x38,0xf6,103,40 + .byte 0xf3,76,0x0f,0x38,0xf6,201 + mov %r12, 40(%rdi) + adc %rcx, %r9 + bt $0, %r13d + adc %r9, 48(%rdi) + setc %r13b + lea 8(%rdi), %rdi + dec %rsi + jnz Lo6 + jmp Lesma + + + +L7: push %rbx + push %r12 + push %r13 + push %r14 + + mov %rdx, %r14 + xor %r13, %r13 + sub %rcx, %rsi + mov (%rdi), %rax + mov 8(%rdi), %rbx + mov %rax, %rdx + imul %r8, 
%rdx +Lo7: xor %ecx, %ecx + .byte 0xc4,66,155,0xf6,14 + .byte 0xf3,76,0x0f,0x38,0xf6,224 + .byte 0xc4,66,251,0xf6,94,8 + .byte 0x66,73,0x0f,0x38,0xf6,193 + .byte 0xf3,72,0x0f,0x38,0xf6,195 + .byte 0xc4,66,227,0xf6,78,16 + .byte 0x66,73,0x0f,0x38,0xf6,219 + .byte 0xc4,66,171,0xf6,94,24 + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xf3,72,0x0f,0x38,0xf6,95,16 + .byte 0xc4,66,155,0xf6,78,32 + .byte 0xf3,76,0x0f,0x38,0xf6,87,24 + .byte 0x66,77,0x0f,0x38,0xf6,227 + mov %r10, 24(%rdi) + .byte 0xf3,76,0x0f,0x38,0xf6,103,32 + .byte 0xc4,66,171,0xf6,94,40 + mov %r12, 32(%rdi) + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xc4,66,155,0xf6,78,48 + .byte 0xf3,76,0x0f,0x38,0xf6,87,40 + .byte 0x66,77,0x0f,0x38,0xf6,227 + mov %r10, 40(%rdi) + mov %rax, %rdx + .byte 0xc4,66,235,0xf6,208 + .byte 0xf3,76,0x0f,0x38,0xf6,103,48 + .byte 0xf3,76,0x0f,0x38,0xf6,201 + mov %r12, 48(%rdi) + adc %rcx, %r9 + bt $0, %r13d + adc %r9, 56(%rdi) + setc %r13b + lea 8(%rdi), %rdi + dec %rsi + jnz Lo7 + jmp Lesma + + + +L8: push %rbx + push %r12 + push %r13 + push %r14 + + mov %rdx, %r14 + xor %r13, %r13 + sub %rcx, %rsi + mov (%rdi), %rax + mov 8(%rdi), %rbx + mov %rax, %rdx + imul %r8, %rdx +Lo8: xor %ecx, %ecx + .byte 0xc4,66,171,0xf6,30 + .byte 0xf3,76,0x0f,0x38,0xf6,208 + .byte 0xc4,66,251,0xf6,78,8 + .byte 0x66,73,0x0f,0x38,0xf6,195 + .byte 0xf3,72,0x0f,0x38,0xf6,195 + .byte 0xc4,66,227,0xf6,94,16 + .byte 0x66,73,0x0f,0x38,0xf6,217 + .byte 0xc4,66,155,0xf6,78,24 + .byte 0xf3,72,0x0f,0x38,0xf6,95,16 + .byte 0x66,77,0x0f,0x38,0xf6,227 + .byte 0xc4,66,171,0xf6,94,32 + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xf3,76,0x0f,0x38,0xf6,103,24 + mov %r12, 24(%rdi) + .byte 0xc4,66,155,0xf6,78,40 + .byte 0xf3,76,0x0f,0x38,0xf6,87,32 + .byte 0x66,77,0x0f,0x38,0xf6,227 + mov %r10, 32(%rdi) + .byte 0xf3,76,0x0f,0x38,0xf6,103,40 + .byte 0xc4,66,171,0xf6,94,48 + mov %r12, 40(%rdi) + .byte 0x66,77,0x0f,0x38,0xf6,209 + .byte 0xc4,66,155,0xf6,78,56 + .byte 0xf3,76,0x0f,0x38,0xf6,87,48 + .byte 
0x66,77,0x0f,0x38,0xf6,227 + mov %r8, %rdx + .byte 0xc4,98,235,0xf6,216 + mov %r10, 48(%rdi) + .byte 0xf3,76,0x0f,0x38,0xf6,103,56 + .byte 0xf3,76,0x0f,0x38,0xf6,201 + mov %r12, 56(%rdi) + adc %rcx, %r9 + bt $0, %r13d + adc %r9, 64(%rdi) + setc %r13b + lea 8(%rdi), %rdi + dec %rsi + jnz Lo8 + jmp Lesma + + +Lesma:mov %rax, (%rdi) + mov %rbx, 8(%rdi) + mov %r13, %rax + pop %r14 + pop %r13 + pop %r12 + pop %rbx + + ret + + + .text + .align 3, 0x90 +Latab:.set Lf0_tmp, Lf0-Latab + .long Lf0_tmp + + .set Lf1_tmp, Lf1-Latab + .long Lf1_tmp + + .set Lf2_tmp, Lf2-Latab + .long Lf2_tmp + + .set Lf3_tmp, Lf3-Latab + .long Lf3_tmp + + .set Lf4_tmp, Lf4-Latab + .long Lf4_tmp + + .set Lf5_tmp, Lf5-Latab + .long Lf5_tmp + + .set Lf6_tmp, Lf6-Latab + .long Lf6_tmp + + .set Lf7_tmp, Lf7-Latab + .long Lf7_tmp + + .set L1_tmp, L1-Latab + .long L1_tmp + + .set L2_tmp, L2-Latab + .long L2_tmp + + .set L3_tmp, L3-Latab + .long L3_tmp + + .set L4_tmp, L4-Latab + .long L4_tmp + + .set L5_tmp, L5-Latab + .long L5_tmp + + .set L6_tmp, L6-Latab + .long L6_tmp + + .set L7_tmp, L7-Latab + .long L7_tmp + + .set L8_tmp, L8-Latab + .long L8_tmp + + .text + diff --git a/ext/gmp/gen/x86_64-macos/sieve_table.h b/ext/gmp/gen/x86_64-macos/sieve_table.h new file mode 100644 index 0000000000..ee9ac14360 --- /dev/null +++ b/ext/gmp/gen/x86_64-macos/sieve_table.h @@ -0,0 +1,46 @@ +/* This file generated by gen-sieve.c - DO NOT EDIT. 
*/ + +#if GMP_LIMB_BITS != 64 +Error, error, this data is for 64 bits +#endif + +#define PRIMESIEVE_INIT_TABLE \ + CNST_LIMB (0x3294C9E069128480), /* 5 - 196 (42 primes) */ \ + CNST_LIMB (0x95A35E1EC4AB21DC), /* 197 - 388 (32 primes) */ \ + CNST_LIMB (0x4AD7CE99B8693366), /* 389 - 580 (30 primes) */ \ + CNST_LIMB (0x6595B6DA728DC52B), /* 581 - 772 (30 primes) */ \ + CNST_LIMB (0xEA6D9F8787B0CEDE), /* 773 - 964 (26 primes) */ \ + CNST_LIMB (0x3F56A1F4CD3275A9), /* 965 - 1156 (29 primes) */ \ + CNST_LIMB (0xFD3848FB74A76ADB), /* 1157 - 1348 (26 primes) */ \ + CNST_LIMB (0xDBBA0DD1A1EDF6AF), /* 1349 - 1540 (25 primes) */ \ + CNST_LIMB (0xCEC7F17ED22799A5), /* 1541 - 1732 (27 primes) */ \ + CNST_LIMB (0xEAEC17BDBB717D56), /* 1733 - 1924 (24 primes) */ \ + CNST_LIMB (0x3B0EB7B3585AFCF3), /* 1925 - 2116 (26 primes) */ \ + CNST_LIMB (0xE563D8F69FDF6C4F), /* 2117 - 2308 (23 primes) */ \ + CNST_LIMB (0xFE5BA7ABA45E92FC), /* 2309 - 2500 (25 primes) */ \ + CNST_LIMB (0x158DEE6F3BF49B7D), /* 2501 - 2692 (24 primes) */ \ + CNST_LIMB (0xBE5A7BC4EDE6CD1A), /* 2693 - 2884 (26 primes) */ \ + CNST_LIMB (0xD7679B3FCA7BB6AD), /* 2885 - 3076 (22 primes) */ \ + CNST_LIMB (0xC3F66B971FEF37E9), /* 3077 - 3268 (22 primes) */ \ + CNST_LIMB (0x6F7EBCF339C953FD), /* 3269 - 3460 (22 primes) */ \ + CNST_LIMB (0xD5A5ECDCD235DBF0), /* 3461 - 3652 (27 primes) */ \ + CNST_LIMB (0xECFA7B2FD5B65E3B), /* 3653 - 3844 (22 primes) */ \ + CNST_LIMB (0xD28EFDF9C89F67B1), /* 3845 - 4036 (25 primes) */ \ + CNST_LIMB (0xCB7F7C7A3DD3AF4F), /* 4037 - 4228 (21 primes) */ \ + CNST_LIMB (0xEEBED6CDFF6B32CC), /* 4229 - 4420 (22 primes) */ \ + CNST_LIMB (0xD5BD73F85ECFA97C), /* 4421 - 4612 (23 primes) */ \ + CNST_LIMB (0x21FDBE4FBBAD48F7), /* 4613 - 4804 (24 primes) */ \ + CNST_LIMB (0x5E35A3B5EEB7FDE7), /* 4805 - 4996 (21 primes) */ \ + CNST_LIMB (0xD9EBFD53A7DBBCC9), /* 4997 - 5188 (22 primes) */ \ + CNST_LIMB (0xFF9EDEAF2EFE1F76), /* 5189 - 5380 (18 primes) */ +#define PRIMESIEVE_NUMBEROF_TABLE 28 +/* #define 
PRIMESIEVE_PRIMES_IN_TABLE 706 */ +#define PRIMESIEVE_HIGHEST_PRIME 5351 +/* #define PRIMESIEVE_FIRST_UNCHECKED 5381 */ + +#define SIEVE_MASK1 CNST_LIMB(0x3204C1A049120485) +#define SIEVE_MASKT CNST_LIMB(0xA1204892058) +#define SIEVE_2MSK1 CNST_LIMB(0x29048402110840A) +#define SIEVE_2MSK2 CNST_LIMB(0x9402180C40230184) +#define SIEVE_2MSKT CNST_LIMB(0x5021088402120) + From 1d0905ba3d686266966df355708aeec400e30942 Mon Sep 17 00:00:00 2001 From: pkova Date: Thu, 17 Oct 2024 17:12:25 +0300 Subject: [PATCH 94/97] build: remember to copy sieve_table.h in gmp build readme --- ext/gmp/gen/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/ext/gmp/gen/README.md b/ext/gmp/gen/README.md index 6228292b1b..52bdd96af7 100644 --- a/ext/gmp/gen/README.md +++ b/ext/gmp/gen/README.md @@ -37,6 +37,7 @@ Now, under the GMP root dir run `make` and copy these files as well: - `fac_table.h` - `fib_table.h` - `trialdivtab.h` +- `sieve_table.h` - `mpn/fib_table.c` - `mpn/jacobitab.h` - `mpn/mp_bases.c` From dd362b39ebdefb064c4f0cc32e1133fedee336b2 Mon Sep 17 00:00:00 2001 From: pkova Date: Thu, 17 Oct 2024 17:12:49 +0300 Subject: [PATCH 95/97] ci: fix paths to make ci run --- .github/workflows/feature.yml | 11 ++++++----- .github/workflows/master.yml | 13 ++++++------- .github/workflows/next.yml | 12 ++++++------ .github/workflows/release.yml | 12 ++++++------ 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/.github/workflows/feature.yml b/.github/workflows/feature.yml index 60444b89e3..d78b911e13 100644 --- a/.github/workflows/feature.yml +++ b/.github/workflows/feature.yml @@ -3,12 +3,13 @@ name: Feature pull request on: pull_request: paths: - - '.bazelrc' - - '.github/workflows/*.yml' - - 'BUILD.bazel' - - 'WORKSPACE.bazel' - - 'bazel/**' + - 'build.zig' + - 'build.zig.zon' + - 'ext/**' + - '!ext/**.md' - 'pkg/**' + - '.github/workflows/**.yml' + - '*.sh' jobs: urbit: diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 
368c417b4f..02a2b84c58 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -5,14 +5,13 @@ on: branches: - master paths: - - '*.bazel' - - '.bazelrc' - - '.github/workflows/**.yml' - - 'PACE' - - 'VERSION' - - 'bazel/**' + - 'build.zig' + - 'build.zig.zon' + - 'ext/**' + - '!ext/**.md' - 'pkg/**' - + - '.github/workflows/**.yml' + - '*.sh' jobs: urbit: diff --git a/.github/workflows/next.yml b/.github/workflows/next.yml index 07250449e3..71738ad940 100644 --- a/.github/workflows/next.yml +++ b/.github/workflows/next.yml @@ -5,13 +5,13 @@ on: branches: - 'next/kelvin/*' paths: - - '*.bazel' - - '.bazelrc' - - '.github/workflows/**.yml' - - 'PACE' - - 'VERSION' - - 'bazel/**' + - 'build.zig' + - 'build.zig.zon' + - 'ext/**' + - '!ext/**.md' - 'pkg/**' + - '.github/workflows/**.yml' + - '*.sh' jobs: urbit: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 904884d5c1..ca4a907b07 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -5,13 +5,13 @@ on: branches: - release paths: - - '*.bazel' - - '.bazelrc' - - '.github/workflows/**.yml' - - 'PACE' - - 'VERSION' - - 'bazel/**' + - 'build.zig' + - 'build.zig.zon' + - 'ext/**' + - '!ext/**.md' - 'pkg/**' + - '.github/workflows/**.yml' + - '*.sh' jobs: urbit: From fb807c160d65ae532a38126754682d2780dc1ba1 Mon Sep 17 00:00:00 2001 From: pkova Date: Thu, 17 Oct 2024 17:24:43 +0300 Subject: [PATCH 96/97] build: forgot to commit some gmp files --- ext/gmp/gen/aarch64-macos/mpn/divrem_1.s | 235 ++++++++++++++++++++++ ext/gmp/gen/aarch64-macos/mpn/tmp-add_n.s | 134 ++++++++++++ ext/gmp/gen/aarch64-macos/sieve_table.h | 46 +++++ 3 files changed, 415 insertions(+) create mode 100644 ext/gmp/gen/aarch64-macos/mpn/divrem_1.s create mode 100644 ext/gmp/gen/aarch64-macos/mpn/tmp-add_n.s create mode 100644 ext/gmp/gen/aarch64-macos/sieve_table.h diff --git a/ext/gmp/gen/aarch64-macos/mpn/divrem_1.s b/ext/gmp/gen/aarch64-macos/mpn/divrem_1.s new file 
mode 100644 index 0000000000..3d4ca8c84f --- /dev/null +++ b/ext/gmp/gen/aarch64-macos/mpn/divrem_1.s @@ -0,0 +1,235 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 3 + .globl ___gmpn_preinv_divrem_1 + +___gmpn_preinv_divrem_1: + cbz x3, Lfz + stp x29, x30, [sp, #-80]! + mov x29, sp + stp x19, x20, [sp, #16] + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + + sub x21, x3, #1 + add x7, x21, x1 + add x20, x2, x21, lsl #3 + add x19, x0, x7, lsl #3 + mov x24, x1 + mov x22, x4 + mov x0, x5 + tbnz x4, #63, Lnentry + mov x23, x6 + b Luentry + + + .text + .align 3 + .globl ___gmpn_divrem_1 + +___gmpn_divrem_1: + cbz x3, Lfz + stp x29, x30, [sp, #-80]! + mov x29, sp + stp x19, x20, [sp, #16] + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + + sub x21, x3, #1 + add x7, x21, x1 + add x20, x2, x21, lsl #3 + add x19, x0, x7, lsl #3 + mov x24, x1 + mov x22, x4 + tbnz x4, #63, Lnormalised + +Lunnorm: + clz x23, x22 + lsl x0, x22, x23 + bl ___gmpn_invert_limb +Luentry: + lsl x22, x22, x23 + ldr x7, [x20], #-8 + sub x8, xzr, x23 + lsr x11, x7, x8 + lsl x1, x7, x23 + cbz x21, Luend + +Lutop:ldr x7, [x20], #-8 + add x2, x11, #1 + mul x10, x11, x0 + umulh x17, x11, x0 + lsr x9, x7, x8 + orr x1, x1, x9 + adds x10, x1, x10 + adc x2, x2, x17 + msub x11, x22, x2, x1 + lsl x1, x7, x23 + cmp x10, x11 + add x14, x11, x22 + csel x11, x14, x11, cc + sbc x2, x2, xzr + cmp x11, x22 + bcs Lufx +Luok: str x2, [x19], #-8 + sub x21, x21, #1 + cbnz x21, Lutop + +Luend:add x2, x11, #1 + mul x10, x11, x0 + umulh x17, x11, x0 + adds x10, x1, x10 + adc x2, x2, x17 + msub x11, x22, x2, x1 + cmp x10, x11 + add x14, x11, x22 + csel x11, x14, x11, cc + sbc x2, x2, xzr + subs x14, x11, x22 + adc x2, x2, xzr + csel x11, x14, x11, cs + str x2, [x19], #-8 + + cbnz x24, Lftop + lsr x0, x11, x23 + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp x29, x30, [sp], #80 + ret + 
+Lufx: add x2, x2, #1 + sub x11, x11, x22 + b Luok + + +Lnormalised: + mov x0, x22 + bl ___gmpn_invert_limb +Lnentry: + ldr x7, [x20], #-8 + subs x14, x7, x22 + adc x2, xzr, xzr + csel x11, x14, x7, cs + b Lnok + +Lntop:ldr x1, [x20], #-8 + add x2, x11, #1 + mul x10, x11, x0 + umulh x17, x11, x0 + adds x10, x1, x10 + adc x2, x2, x17 + msub x11, x22, x2, x1 + cmp x10, x11 + add x14, x11, x22 + csel x11, x14, x11, cc + sbc x2, x2, xzr + cmp x11, x22 + bcs Lnfx +Lnok: str x2, [x19], #-8 + sub x21, x21, #1 + tbz x21, #63, Lntop + +Lnend:cbnz x24, Lfrac + mov x0, x11 + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp x29, x30, [sp], #80 + ret + +Lnfx: add x2, x2, #1 + sub x11, x11, x22 + b Lnok + +Lfrac:mov x23, #0 +Lftop:add x2, x11, #1 + mul x10, x11, x0 + umulh x17, x11, x0 + add x2, x2, x17 + msub x11, x22, x2, xzr + cmp x10, x11 + add x14, x11, x22 + csel x11, x14, x11, cc + sbc x2, x2, xzr + str x2, [x19], #-8 + sub x24, x24, #1 + cbnz x24, Lftop + + lsr x0, x11, x23 + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp x29, x30, [sp], #80 + ret + + +Lfz: cbz x1, Lzend +Lztop:str xzr, [x0], #8 + sub x1, x1, #1 + cbnz x1, Lztop +Lzend:mov x0, #0 + ret + diff --git a/ext/gmp/gen/aarch64-macos/mpn/tmp-add_n.s b/ext/gmp/gen/aarch64-macos/mpn/tmp-add_n.s new file mode 100644 index 0000000000..5f2b539e98 --- /dev/null +++ b/ext/gmp/gen/aarch64-macos/mpn/tmp-add_n.s @@ -0,0 +1,134 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 3 + .globl ___gmpn_add_nc + +___gmpn_add_nc: + cmp x4, #1 + b Lent + + .text + .align 3 + .globl ___gmpn_add_n + +___gmpn_add_n: + cmn xzr, xzr +Lent: lsr x17, x3, #2 + tbz x3, #0, Lbx0 + +Lbx1: ldr x7, [x1] + ldr x11, [x2] + adcs x13, x7, x11 + str x13, [x0],#8 + tbnz x3, #1, Lb11 + +Lb01: cbz x17, Lret + ldp x4, x5, [x1,#8] + ldp x8, x9, [x2,#8] + sub x1, x1, #8 + sub x2, x2, #8 + b 
Lmid + +Lb11: ldp x6, x7, [x1,#8] + ldp x10, x11, [x2,#8] + add x1, x1, #8 + add x2, x2, #8 + cbz x17, Lend + b Ltop + +Lbx0: tbnz x3, #1, Lb10 + +Lb00: ldp x4, x5, [x1] + ldp x8, x9, [x2] + sub x1, x1, #16 + sub x2, x2, #16 + b Lmid + +Lb10: ldp x6, x7, [x1] + ldp x10, x11, [x2] + cbz x17, Lend + + .align 4 +Ltop: ldp x4, x5, [x1,#16] + ldp x8, x9, [x2,#16] + adcs x12, x6, x10 + adcs x13, x7, x11 + stp x12, x13, [x0],#16 +Lmid: ldp x6, x7, [x1,#32]! + ldp x10, x11, [x2,#32]! + adcs x12, x4, x8 + adcs x13, x5, x9 + stp x12, x13, [x0],#16 + sub x17, x17, #1 + cbnz x17, Ltop + +Lend: adcs x12, x6, x10 + adcs x13, x7, x11 + stp x12, x13, [x0] +Lret: cset x0, cs + ret + diff --git a/ext/gmp/gen/aarch64-macos/sieve_table.h b/ext/gmp/gen/aarch64-macos/sieve_table.h new file mode 100644 index 0000000000..ee9ac14360 --- /dev/null +++ b/ext/gmp/gen/aarch64-macos/sieve_table.h @@ -0,0 +1,46 @@ +/* This file generated by gen-sieve.c - DO NOT EDIT. */ + +#if GMP_LIMB_BITS != 64 +Error, error, this data is for 64 bits +#endif + +#define PRIMESIEVE_INIT_TABLE \ + CNST_LIMB (0x3294C9E069128480), /* 5 - 196 (42 primes) */ \ + CNST_LIMB (0x95A35E1EC4AB21DC), /* 197 - 388 (32 primes) */ \ + CNST_LIMB (0x4AD7CE99B8693366), /* 389 - 580 (30 primes) */ \ + CNST_LIMB (0x6595B6DA728DC52B), /* 581 - 772 (30 primes) */ \ + CNST_LIMB (0xEA6D9F8787B0CEDE), /* 773 - 964 (26 primes) */ \ + CNST_LIMB (0x3F56A1F4CD3275A9), /* 965 - 1156 (29 primes) */ \ + CNST_LIMB (0xFD3848FB74A76ADB), /* 1157 - 1348 (26 primes) */ \ + CNST_LIMB (0xDBBA0DD1A1EDF6AF), /* 1349 - 1540 (25 primes) */ \ + CNST_LIMB (0xCEC7F17ED22799A5), /* 1541 - 1732 (27 primes) */ \ + CNST_LIMB (0xEAEC17BDBB717D56), /* 1733 - 1924 (24 primes) */ \ + CNST_LIMB (0x3B0EB7B3585AFCF3), /* 1925 - 2116 (26 primes) */ \ + CNST_LIMB (0xE563D8F69FDF6C4F), /* 2117 - 2308 (23 primes) */ \ + CNST_LIMB (0xFE5BA7ABA45E92FC), /* 2309 - 2500 (25 primes) */ \ + CNST_LIMB (0x158DEE6F3BF49B7D), /* 2501 - 2692 (24 primes) */ \ + CNST_LIMB 
(0xBE5A7BC4EDE6CD1A), /* 2693 - 2884 (26 primes) */ \ + CNST_LIMB (0xD7679B3FCA7BB6AD), /* 2885 - 3076 (22 primes) */ \ + CNST_LIMB (0xC3F66B971FEF37E9), /* 3077 - 3268 (22 primes) */ \ + CNST_LIMB (0x6F7EBCF339C953FD), /* 3269 - 3460 (22 primes) */ \ + CNST_LIMB (0xD5A5ECDCD235DBF0), /* 3461 - 3652 (27 primes) */ \ + CNST_LIMB (0xECFA7B2FD5B65E3B), /* 3653 - 3844 (22 primes) */ \ + CNST_LIMB (0xD28EFDF9C89F67B1), /* 3845 - 4036 (25 primes) */ \ + CNST_LIMB (0xCB7F7C7A3DD3AF4F), /* 4037 - 4228 (21 primes) */ \ + CNST_LIMB (0xEEBED6CDFF6B32CC), /* 4229 - 4420 (22 primes) */ \ + CNST_LIMB (0xD5BD73F85ECFA97C), /* 4421 - 4612 (23 primes) */ \ + CNST_LIMB (0x21FDBE4FBBAD48F7), /* 4613 - 4804 (24 primes) */ \ + CNST_LIMB (0x5E35A3B5EEB7FDE7), /* 4805 - 4996 (21 primes) */ \ + CNST_LIMB (0xD9EBFD53A7DBBCC9), /* 4997 - 5188 (22 primes) */ \ + CNST_LIMB (0xFF9EDEAF2EFE1F76), /* 5189 - 5380 (18 primes) */ +#define PRIMESIEVE_NUMBEROF_TABLE 28 +/* #define PRIMESIEVE_PRIMES_IN_TABLE 706 */ +#define PRIMESIEVE_HIGHEST_PRIME 5351 +/* #define PRIMESIEVE_FIRST_UNCHECKED 5381 */ + +#define SIEVE_MASK1 CNST_LIMB(0x3204C1A049120485) +#define SIEVE_MASKT CNST_LIMB(0xA1204892058) +#define SIEVE_2MSK1 CNST_LIMB(0x29048402110840A) +#define SIEVE_2MSK2 CNST_LIMB(0x9402180C40230184) +#define SIEVE_2MSKT CNST_LIMB(0x5021088402120) + From f1d8477b9e4fa59903a5d0dede525c955624e1bf Mon Sep 17 00:00:00 2001 From: pkova Date: Thu, 17 Oct 2024 19:02:32 +0300 Subject: [PATCH 97/97] build: merge develop to next/kelvin/410, convert lagoon to zig build --- build.zig | 9 +++++ build.zig.zon | 3 ++ ext/softblas/build.zig | 80 ++++++++++++++++++++++++++++++++++++++ ext/softblas/build.zig.zon | 16 ++++++++ pkg/noun/jets/e/crc32.c | 2 +- pkg/noun/jets/i/lagoon.c | 7 ++-- 6 files changed, 113 insertions(+), 4 deletions(-) create mode 100644 ext/softblas/build.zig create mode 100644 ext/softblas/build.zig.zon diff --git a/build.zig b/build.zig index e6f1b25970..815aedeb69 100644 --- a/build.zig +++ 
b/build.zig @@ -283,6 +283,11 @@ fn build_single( .optimize = optimize, }); + const softblas = b.dependency("softblas", .{ + .target = target, + .optimize = optimize, + }); + const softfloat = b.dependency("softfloat", .{ .target = target, .optimize = optimize, @@ -485,10 +490,12 @@ fn build_single( pkg_noun.linkLibrary(pdjson.artifact("pdjson")); pkg_noun.linkLibrary(sigsegv.artifact("sigsegv")); pkg_noun.linkLibrary(softfloat.artifact("softfloat")); + pkg_noun.linkLibrary(softblas.artifact("softblas")); if (t.os.tag == .linux) pkg_noun.linkLibrary(unwind.artifact("unwind")); pkg_noun.linkLibrary(urcrypt.artifact("urcrypt")); pkg_noun.linkLibrary(whereami.artifact("whereami")); + pkg_noun.linkLibrary(zlib.artifact("z")); pkg_noun.linkLibC(); pkg_noun.addIncludePath(b.path("pkg/noun")); @@ -615,6 +622,7 @@ fn build_single( "jets/e/argon2.c", "jets/e/base.c", "jets/e/blake.c", + "jets/e/crc32.c", "jets/e/cue.c", "jets/e/ed_add_double_scalarmult.c", "jets/e/ed_add_scalarmult_scalarmult_base.c", @@ -678,6 +686,7 @@ fn build_single( "jets/f/ut_mull.c", "jets/f/ut_nest.c", "jets/f/ut_rest.c", + "jets/i/lagoon.c", "jets/tree.c", "log.c", "manage.c", diff --git a/build.zig.zon b/build.zig.zon index f39c710e39..3f6a8c07b7 100644 --- a/build.zig.zon +++ b/build.zig.zon @@ -49,6 +49,9 @@ .softfloat = .{ .path = "./ext/softfloat", }, + .softblas = .{ + .path = "./ext/softblas", + }, .unwind = .{ .path = "./ext/unwind", }, diff --git a/ext/softblas/build.zig b/ext/softblas/build.zig new file mode 100644 index 0000000000..1fb3b066b5 --- /dev/null +++ b/ext/softblas/build.zig @@ -0,0 +1,80 @@ +const std = @import("std"); + +pub fn build(b: *std.Build) void { + const target = b.standardTargetOptions(.{}); + const optimize = b.standardOptimizeOption(.{}); + + const lib = b.addStaticLibrary(.{ + .name = "softblas", + .target = target, + .optimize = optimize, + }); + + const dep_c = b.dependency("softblas", .{ + .target = target, + .optimize = optimize, + }); + + const softfloat = 
b.dependency("softfloat", .{ + .target = target, + .optimize = optimize, + }); + + lib.addIncludePath(dep_c.path("include")); + + lib.addCSourceFiles(.{ + .root = dep_c.path(""), + .files = &.{ + "src/softblas_state.c", + "src/blas/level1/sasum.c", + "src/blas/level1/dasum.c", + "src/blas/level1/hasum.c", + "src/blas/level1/qasum.c", + "src/blas/level1/saxpy.c", + "src/blas/level1/daxpy.c", + "src/blas/level1/haxpy.c", + "src/blas/level1/qaxpy.c", + "src/blas/level1/scopy.c", + "src/blas/level1/dcopy.c", + "src/blas/level1/hcopy.c", + "src/blas/level1/qcopy.c", + "src/blas/level1/sdot.c", + "src/blas/level1/ddot.c", + "src/blas/level1/hdot.c", + "src/blas/level1/qdot.c", + "src/blas/level1/snrm2.c", + "src/blas/level1/dnrm2.c", + "src/blas/level1/hnrm2.c", + "src/blas/level1/qnrm2.c", + "src/blas/level1/sscal.c", + "src/blas/level1/dscal.c", + "src/blas/level1/hscal.c", + "src/blas/level1/qscal.c", + "src/blas/level1/sswap.c", + "src/blas/level1/dswap.c", + "src/blas/level1/hswap.c", + "src/blas/level1/qswap.c", + "src/blas/level1/isamax.c", + "src/blas/level1/idamax.c", + "src/blas/level1/ihamax.c", + "src/blas/level1/iqamax.c", + "src/blas/level2/sgemv.c", + "src/blas/level2/dgemv.c", + "src/blas/level2/hgemv.c", + "src/blas/level2/qgemv.c", + "src/blas/level3/sgemm.c", + "src/blas/level3/dgemm.c", + "src/blas/level3/hgemm.c", + "src/blas/level3/qgemm.c", + }, + .flags = &.{ + "-fno-sanitize=all", + }, + }); + + lib.installHeader(dep_c.path("include/softblas.h"), "softblas.h"); + + lib.linkLibC(); + lib.linkLibrary(softfloat.artifact("softfloat")); + b.installArtifact(lib); +} diff --git a/ext/softblas/build.zig.zon b/ext/softblas/build.zig.zon new file mode 100644 index 0000000000..431171b53b --- /dev/null +++ b/ext/softblas/build.zig.zon @@ -0,0 +1,16 @@ +.{ + .name = "softblas", + .version = "0.0.1", + .dependencies = .{ + .softfloat = .{ + .path = "../softfloat", + }, + .softblas = .{ + .url = 
"https://github.com/urbit/SoftBLAS/archive/cbffb33f19ea02f9ffbd184d445123c57929ec53.tar.gz", + .hash = "1220617c11d869ef2316571a430f51f93470e2d714141deb3bdfaa6b578cf151f258", + }, + }, + .paths = .{ + "", + }, +} diff --git a/pkg/noun/jets/e/crc32.c b/pkg/noun/jets/e/crc32.c index 1afc473a30..0daa032c7f 100644 --- a/pkg/noun/jets/e/crc32.c +++ b/pkg/noun/jets/e/crc32.c @@ -21,7 +21,7 @@ u3qe_crc32(u3_noun input_octs) c3_y* input; if (c3y == u3a_is_cat(tail)) { - input = &tail; + input = (c3_y*)&tail; } else { u3a_atom* vat_u = u3a_to_ptr(tail); diff --git a/pkg/noun/jets/i/lagoon.c b/pkg/noun/jets/i/lagoon.c index 13c0f2a138..db7088b96e 100644 --- a/pkg/noun/jets/i/lagoon.c +++ b/pkg/noun/jets/i/lagoon.c @@ -3,6 +3,8 @@ #include "jets/q.h" #include "jets/w.h" +#include "c3/motes.h" + #include "noun.h" #include "softfloat.h" #include "softblas.h" @@ -1628,7 +1630,7 @@ for (c3_d i = 0; i < len_x; i++) { float32_t x_val32 = ((float32_t*)x_bytes)[i]; // Perform division x/n - float32_t div_result32 = f32_mul((float32_t)in32, (float32_t)x_val32); + float32_t div_result32 = f32_mul(in32, x_val32); // Compute floor of the division result c3_ds floor_result32 = f32_to_i64(div_result32, softfloat_round_minMag, false); float32_t floor_float32 = i64_to_f32(floor_result32); @@ -3284,13 +3286,12 @@ { return u3m_bail(c3__exit); } else { - u3_noun x_shape, x_bloq, x_kind, x_tail, + u3_noun x_shape, x_bloq, x_kind, y_shape, rnd; x_shape = u3h(x_meta); // 2 x_bloq = u3h(u3t(x_meta)); // 6 x_kind = u3h(u3t(u3t(x_meta))); // 14 - x_tail = u3t(u3t(u3t(x_meta))); // 15 y_shape = u3h(y_meta); // 2 rnd = u3h(u3t(u3t(u3t(cor)))); // 30 if ( c3n == _check(u3nc(x_meta, x_data)) ||