diff --git a/.github/workflows/ci-interpreter.yml b/.github/workflows/ci-interpreter.yml index 91a5fc423..6c09ffc0f 100644 --- a/.github/workflows/ci-interpreter.yml +++ b/.github/workflows/ci-interpreter.yml @@ -27,7 +27,7 @@ jobs: - name: Setup Node.js uses: actions/setup-node@v2 with: - node-version: 19.x + node-version: 20.x - name: Build interpreter run: cd interpreter && opam exec make - name: Run tests diff --git a/document/core/appendix/changes.rst b/document/core/appendix/changes.rst index 39f40f9cf..352bc504a 100644 --- a/document/core/appendix/changes.rst +++ b/document/core/appendix/changes.rst @@ -518,6 +518,48 @@ Added managed reference types. [#proposal-gc]_ - |EXTERNCONVERTANY| +.. index:: instruction, vector instruction, SIMD + +Relaxed Vector Instructions +........................... + +Added new *relaxed* vector instructions, +whose behaviour is non-deterministic and implementation-dependent. [#proposal-relaxed]_ + +* New binary :ref:`vector instruction `: + + - :math:`\K{f}\!N\!\K{x}\!M\!\K{.relaxed\_min}` + - :math:`\K{f}\!N\!\K{x}\!M\!\K{.relaxed\_max}` + - :math:`\K{i16x8.relaxed\_q15mulr\_s}` + - :math:`\K{i16x8.relaxed\_dot\_i8x16\_i7x16\_s}` + +* New ternary :ref:`vector instruction `: + + - :math:`\K{f}\!N\!\K{x}\!M\!\K{.relaxed\_madd}` + - :math:`\K{f}\!N\!\K{x}\!M\!\K{.relaxed\_nmadd}` + - :math:`\K{i}\!N\!\K{x}\!M\!\K{.relaxed\_laneselect}` + - :math:`\K{i32x4.relaxed\_dot\_i8x16\_i7x16\_add\_s}` + +* New conversion :ref:`vector instructions `: + + - :math:`\K{i32x4.relaxed\_trunc\_f32x4\_}\sx` + - :math:`\K{i32x4.relaxed\_trunc\_f64x2\_}\sx\K{\_zero}` + +* New byte reordering :ref:`vector instruction `: + + - :math:`\K{i8x16.relaxed\_swizzle}` + + +.. index:: determinism, non-determinism, profiles + +Profiles +........ + +Introduced the concept of :ref:`profile ` for specifying language subsets. + +* A new profile defining a :ref:`deterministic ` mode of execution. + + .. index:: text format, annotation, custom section, identifier, module, type, function, local, structure field Custom Annotations @@ -542,7 +584,7 @@ mirroring the role of custom sections in the binary format. [#proposal-annot]_ .. [#proposal-extconst] - https://github.com/WebAssembly/extended-const/blob/main/proposals/extended-const/ + https://github.com/WebAssembly/spec/blob/main/proposals/extended-const/ .. [#proposal-tailcall] https://github.com/WebAssembly/spec/tree/main/proposals/tail-call/ @@ -551,7 +593,7 @@ mirroring the role of custom sections in the binary format. [#proposal-annot]_ https://github.com/WebAssembly/spec/tree/main/proposals/exception-handling/ .. [#proposal-multimem] - https://github.com/WebAssembly/multi-memory/blob/main/proposals/multi-memory/ + https://github.com/WebAssembly/spec/blob/main/proposals/multi-memory/ .. [#proposal-typedref] https://github.com/WebAssembly/spec/tree/main/proposals/function-references/ @@ -559,5 +601,8 @@ mirroring the role of custom sections in the binary format. [#proposal-annot]_ .. [#proposal-gc] https://github.com/WebAssembly/spec/tree/main/proposals/gc/ +.. [#proposal-relaxed] + https://github.com/WebAssembly/spec/tree/main/proposals/relaxed-simd/ + .. 
[#proposal-annot] https://github.com/WebAssembly/annotations/tree/main/proposals/annotations/ diff --git a/document/core/appendix/index-instructions.py b/document/core/appendix/index-instructions.py index 52ffaaf46..c39fc4a90 100755 --- a/document/core/appendix/index-instructions.py +++ b/document/core/appendix/index-instructions.py @@ -633,7 +633,26 @@ def Instruction(name, opcode, type=None, validation=None, execution=None, operat Instruction(r'\I32X4.\VTRUNC\K{\_sat\_f64x2\_u\_zero}', r'\hex{FD}~~\hex{FD}~~\hex{01}', r'[\V128] \to [\V128]', r'valid-vcvtop', r'exec-vcvtop', r'op-trunc_sat_u'), Instruction(r'\F64X2.\VCONVERT\K{\_low\_i32x4\_s}', r'\hex{FD}~~\hex{FE}~~\hex{01}', r'[\V128] \to [\V128]', r'valid-vcvtop', r'exec-vcvtop', r'op-convert_s'), Instruction(r'\F64X2.\VCONVERT\K{\_low\_i32x4\_u}', r'\hex{FD}~~\hex{FF}~~\hex{01}', r'[\V128] \to [\V128]', r'valid-vcvtop', r'exec-vcvtop', r'op-convert_u'), - Instruction(None, r'\hex{FD}~\hex{00}~\hex{02} \dots'), + Instruction(r'\I8X16.\RELAXEDSWIZZLE', r'\hex{FD}~~\hex{80}~~\hex{02}', r'[\V128~\V128] \to [\V128]', r'valid-relaxed_swizzle', r'exec-relaxed_swizzle', r'op-irelaxed_swizzle'), + Instruction(r'\I32X4.\RELAXEDTRUNC\K{\_f32x4\_s}', r'\hex{FD}~~\hex{81}~~\hex{02}', r'[\V128] \to [\V128]', r'valid-vcvtop', r'exec-vcvtop', r'op-relaxed_trunc_s'), + Instruction(r'\I32X4.\RELAXEDTRUNC\K{\_f32x4\_u}', r'\hex{FD}~~\hex{82}~~\hex{02}', r'[\V128] \to [\V128]', r'valid-vcvtop', r'exec-vcvtop', r'op-relaxed_trunc_u'), + Instruction(r'\I32X4.\RELAXEDTRUNC\K{\_f64x2\_s}', r'\hex{FD}~~\hex{83}~~\hex{02}', r'[\V128] \to [\V128]', r'valid-vcvtop', r'exec-vcvtop', r'op-relaxed_trunc_s'), + Instruction(r'\I32X4.\RELAXEDTRUNC\K{\_f64x2\_u}', r'\hex{FD}~~\hex{84}~~\hex{02}', r'[\V128] \to [\V128]', r'valid-vcvtop', r'exec-vcvtop', r'op-relaxed_trunc_u'), + Instruction(r'\F32X4.\RELAXEDMADD', r'\hex{FD}~~\hex{85}~~\hex{02}', r'[\V128~\V128~\V128] \to [\V128]', r'valid-vternop', r'exec-vternop', r'op-frelaxed_madd'), + Instruction(r'\F32X4.\RELAXEDNMADD', r'\hex{FD}~~\hex{86}~~\hex{02}', r'[\V128~\V128~\V128] \to [\V128]', r'valid-vternop', r'exec-vternop', r'op-frelaxed_nmadd'), + Instruction(r'\F64X2.\RELAXEDMADD', r'\hex{FD}~~\hex{87}~~\hex{02}', r'[\V128~\V128~\V128] \to [\V128]', r'valid-vternop', r'exec-vternop', r'op-frelaxed_madd'), + Instruction(r'\F64X2.\RELAXEDNMADD', r'\hex{FD}~~\hex{88}~~\hex{02}', r'[\V128~\V128~\V128] \to [\V128]', r'valid-vternop', r'exec-vternop', r'op-frelaxed_nmadd'), + Instruction(r'\I8X16.\RELAXEDLANESELECT', r'\hex{FD}~~\hex{89}~~\hex{02}', r'[\V128~\V128~\V128] \to [\V128]', r'valid-relaxed_laneselect', r'exec-relaxed_laneselect', r'op-irelaxed_laneselect'), + Instruction(r'\I16X8.\RELAXEDLANESELECT', r'\hex{FD}~~\hex{8A}~~\hex{02}', r'[\V128~\V128~\V128] \to [\V128]', r'valid-relaxed_laneselect', r'exec-relaxed_laneselect', r'op-irelaxed_laneselect'), + Instruction(r'\I32X4.\RELAXEDLANESELECT', r'\hex{FD}~~\hex{8B}~~\hex{02}', r'[\V128~\V128~\V128] \to [\V128]', r'valid-relaxed_laneselect', r'exec-relaxed_laneselect', r'op-irelaxed_laneselect'), + Instruction(r'\I64X2.\RELAXEDLANESELECT', r'\hex{FD}~~\hex{8C}~~\hex{02}', r'[\V128~\V128~\V128] \to [\V128]', r'valid-relaxed_laneselect', r'exec-relaxed_laneselect', r'op-irelaxed_laneselect'), + Instruction(r'\F32X4.\RELAXEDMIN', r'\hex{FD}~~\hex{8D}~~\hex{02}', r'[\V128~\V128] \to [\V128]', r'valid-vbinop', r'exec-vbinop', r'op-frelaxed_min'), + Instruction(r'\F32X4.\RELAXEDMAX', r'\hex{FD}~~\hex{8E}~~\hex{02}', r'[\V128~\V128] \to [\V128]', r'valid-vbinop', 
r'exec-vbinop', r'op-frelaxed_max'), + Instruction(r'\F64X2.\RELAXEDMIN', r'\hex{FD}~~\hex{8F}~~\hex{02}', r'[\V128~\V128] \to [\V128]', r'valid-vbinop', r'exec-vbinop', r'op-frelaxed_min'), + Instruction(r'\F64X2.\RELAXEDMAX', r'\hex{FD}~~\hex{90}~~\hex{02}', r'[\V128~\V128] \to [\V128]', r'valid-vbinop', r'exec-vbinop', r'op-frelaxed_max'), + Instruction(r'\I16X8.\RELAXEDQ15MULRS', r'\hex{FD}~~\hex{91}~~\hex{02}', r'[\V128~\V128] \to [\V128]', r'valid-vbinop', r'exec-vbinop', r'op-irelaxed_q15mulr_s'), + Instruction(r'\I16X8.\RELAXEDDOT\K{\_i8x16\_i7x16\_s}', r'\hex{FD}~~\hex{92}~~\hex{02}', r'[\V128~\V128] \to [\V128]', r'valid-relaxed_dot', r'exec-relaxed_dot'), + Instruction(r'\I32X4.\RELAXEDDOT\K{\_i8x16\_i7x16\_add\_s}', r'\hex{FD}~~\hex{93}~~\hex{02}', r'[\V128~\V128~\V128] \to [\V128]', r'valid-relaxed_dot', r'exec-relaxed_dot'), Instruction(None, r'\hex{FE}'), Instruction(None, r'\hex{FF}'), ] diff --git a/document/core/appendix/index.rst b/document/core/appendix/index.rst index 9054e64e3..d3d724709 100644 --- a/document/core/appendix/index.rst +++ b/document/core/appendix/index.rst @@ -7,6 +7,7 @@ Appendix :maxdepth: 2 embedding + profiles implementation properties algorithm diff --git a/document/core/appendix/profiles.rst b/document/core/appendix/profiles.rst new file mode 100644 index 000000000..ea32797d9 --- /dev/null +++ b/document/core/appendix/profiles.rst @@ -0,0 +1,131 @@ +.. index:: ! profile, abstract syntax, validation, execution, binary format, text format +.. _profiles: + +Profiles +-------- + +To enable the use of WebAssembly in as many environments as possible, *profiles* specify coherent language subsets that fit constraints imposed by common classes of host environments. +A host platform can thereby decide to support the language only under a restricted profile, or even the intersection of multiple profiles. + + +Conventions +~~~~~~~~~~~ + +A profile modification is specified by decorating selected rules in the main body of this specification with a *profile annotation* that defines them as conditional on the choice of profile. + +For that purpose, every profile defines a *profile marker*, an alphanumeric short-hand like :math:`\profilename{ABC}`. +A profile annotation of the form :math:`\exprofiles{\profile{ABC}~\profile{XYZ}}` on a rule indicates that this rule is *excluded* for either of the profiles whose marker is :math:`\profilename{ABC}` or :math:`\profilename{XYZ}`. + +There are two ways of subsetting the language in a profile: + +* *Syntactic*, by *omitting* a feature, in which case certain constructs are removed from the syntax altogether. + +* *Semantic*, by *restricting* a feature, in which case certain constructs are still present but some behaviours are ruled out. + + +Syntax Annotations +.................. + +To omit a construct from a profile syntactically, respective productions in the grammar of the :ref:`abstract syntax ` are annotated with an associated profile marker. +This is defined to have the following implications: + +1. Any production in the :ref:`binary ` or :ref:`textual ` syntax that produces abstract syntax with a marked construct is omitted by extension. + +2. Any :ref:`validation ` or :ref:`execution ` rule that handles a marked construct is omitted by extension. + +The overall effect is that the respective construct is no longer part of the language under a respective profile. + +.. 
note:: + For example, a "busy" profile marked :math:`\profilename{BUSY}` could rule out the |NOP| instruction by marking the production for it in the abstract syntax as follows: + + .. math:: + \begin{array}{llcl} + \production{instruction} & \instr &::=& + \dots \\ + & \exprofiles{\profile{BUSY}} &|& \NOP \\ + & &|& \UNREACHABLE \\ + \end{array} + + A rule may be annotated by multiple markers, which could be the case if a construct is in the intersection of multiple features. + + +Semantics Annotations +..................... + +To restrict certain behaviours in a profile, individual :ref:`validation ` or :ref:`reduction ` rules or auxiliary definitions are annotated with an associated marker. + +This has the consequence that the respective rule is no longer applicable under the given profile. + +.. note:: + For example, an "infinite" profile marked :math:`\profilename{INF}` could define that growing memory never fails: + + .. math:: + \begin{array}{llcl@{\qquad}l} + & S; F; (\I32.\CONST~n)~\MEMORYGROW~x &\stepto& S'; F; (\I32.\CONST~\X{sz}) + \\&&& + \begin{array}[t]{@{}r@{~}l@{}} + (\iff & F.\AMODULE.\MIMEMS[x] = a \\ + \wedge & \X{sz} = |S.\SMEMS[a].\MIDATA|/64\,\F{Ki} \\ + \wedge & S' = S \with \SMEMS[a] = \growmem(S.\SMEMS[a], n)) \\[1ex] + \end{array} + \\[1ex] + \exprofiles{\profile{INF}} & S; F; (\I32.\CONST~n)~\MEMORYGROW~x &\stepto& S; F; (\I32.\CONST~\signed_{32}^{-1}(-1)) + \end{array} + + +Properties +.......... + +All profiles are defined such that the following properties are preserved: + +* All profiles represent syntactic and semantic subsets of the :ref:`full profile `, i.e., they do not *add* syntax or *alter* behaviour. + +* All profiles are mutually compatible, i.e., no two profiles subset semantic behaviour in inconsistent or ambiguous ways, and any intersection of profiles preserves the properties described here. + +* Profiles do not violate :ref:`soundness `, i.e., all :ref:`configurations ` valid under that profile still have well-defined execution behaviour. + +.. note:: + Tools are generally expected to handle and produce code for the full profile by default. + In particular, producers should not generate code that *depends* on specific profiles. Instead, all code should preserve correctness when executed under the full profile. + + Moreover, profiles should be considered static and fixed for a given platform or ecosystem. Runtime conditioning on the "current" profile is not intended and should be avoided. + + + +Defined Profiles +~~~~~~~~~~~~~~~~ + +.. note:: + The number of defined profiles is expected to remain small in the future. Profiles are intended for broad and permanent use cases only. In particular, profiles are not intended for language versioning. + + +.. index:: full profile + single: profile; full +.. _profile-full: + +Full Profile (:math:`{\small{\mathrm{FUL}}}`) +............................................. + +The *full* profile contains the complete language and all possible behaviours. +It imposes no restrictions, i.e., all rules and definitions are active. +All other profiles define sub-languages of this profile. + + +.. index:: determinism, non-determinism, deterministic profile + single: profile; deterministic +.. _profile-deterministic: + +Deterministic Profile (:math:`{\small{\mathrm{DET}}}`) +...................................................... + +The *deterministic* profile excludes all rules marked :math:`\exprofiles{\PROFDET}`. 
+It defines a sub-language that does not exhibit any incidental non-deterministic behaviour: + +* All :ref:`NaN ` values :ref:`generated ` by :ref:`floating-point instructions ` are canonical and positive. + +* All :ref:`relaxed vector instructions ` have a fixed behaviour that does not depend on the implementation. + +Even under this profile, the |MEMORYGROW| and |TABLEGROW| instructions technically remain :ref:`non-deterministic `, in order to be able to indicate resource exhaustion. + +.. note:: + In future versions of WebAssembly, new non-deterministic behaviour may be added to the language, such that the deterministic profile will induce additional restrictions. diff --git a/document/core/binary/instructions.rst b/document/core/binary/instructions.rst index 3a86f5253..af31e72eb 100644 --- a/document/core/binary/instructions.rst +++ b/document/core/binary/instructions.rst @@ -931,6 +931,31 @@ All other vector instructions are plain opcodes without any immediates. \hex{FD}~~95{:}\Bu32 &\Rightarrow& \F64X2.\VPROMOTE\K{\_low\_f32x4}\\ \end{array} +.. math:: + \begin{array}{llclll} + \phantom{\production{instruction}} & \phantom{\Binstr} &\phantom{::=}& \phantom{\dots} && \phantom{vechaslongerinstructionnames} \\[-2ex] &&|& + \hex{FD}~~256{:}\Bu32 &\Rightarrow& \I8X16.\RELAXEDSWIZZLE \\ &&|& + \hex{FD}~~257{:}\Bu32 &\Rightarrow& \I32X4.\RELAXEDTRUNC\K{\_f32x4\_s} \\ &&|& + \hex{FD}~~258{:}\Bu32 &\Rightarrow& \I32X4.\RELAXEDTRUNC\K{\_f32x4\_u} \\ &&|& + \hex{FD}~~259{:}\Bu32 &\Rightarrow& \I32X4.\RELAXEDTRUNC\K{\_f64x2\_s\_zero} \\ &&|& + \hex{FD}~~260{:}\Bu32 &\Rightarrow& \I32X4.\RELAXEDTRUNC\K{\_f64x2\_u\_zero} \\ &&|& + \hex{FD}~~261{:}\Bu32 &\Rightarrow& \F32X4.\RELAXEDMADD \\ &&|& + \hex{FD}~~262{:}\Bu32 &\Rightarrow& \F32X4.\RELAXEDNMADD \\ &&|& + \hex{FD}~~263{:}\Bu32 &\Rightarrow& \F64X2.\RELAXEDMADD \\ &&|& + \hex{FD}~~264{:}\Bu32 &\Rightarrow& \F64X2.\RELAXEDNMADD \\ &&|& + \hex{FD}~~265{:}\Bu32 &\Rightarrow& \I8X16.\RELAXEDLANESELECT \\ &&|& + \hex{FD}~~266{:}\Bu32 &\Rightarrow& \I16X8.\RELAXEDLANESELECT \\ &&|& + \hex{FD}~~267{:}\Bu32 &\Rightarrow& \I32X4.\RELAXEDLANESELECT \\ &&|& + \hex{FD}~~268{:}\Bu32 &\Rightarrow& \I64X2.\RELAXEDLANESELECT \\ &&|& + \hex{FD}~~269{:}\Bu32 &\Rightarrow& \F32X4.\RELAXEDMIN \\ &&|& + \hex{FD}~~270{:}\Bu32 &\Rightarrow& \F32X4.\RELAXEDMAX \\ &&|& + \hex{FD}~~271{:}\Bu32 &\Rightarrow& \F64X2.\RELAXEDMIN \\ &&|& + \hex{FD}~~272{:}\Bu32 &\Rightarrow& \F64X2.\RELAXEDMAX \\ &&|& + \hex{FD}~~273{:}\Bu32 &\Rightarrow& \I16X8.\RELAXEDQ15MULRS \\ &&|& + \hex{FD}~~274{:}\Bu32 &\Rightarrow& \I16X8.\RELAXEDDOT\K{\_i8x16\_i7x16\_s} \\ &&|& + \hex{FD}~~275{:}\Bu32 &\Rightarrow& \I32X4.\RELAXEDDOT\K{\_i8x16\_i7x16\_add\_s} \\ + \end{array} + .. index:: expression pair: binary format; expression diff --git a/document/core/exec/instructions.rst b/document/core/exec/instructions.rst index aac9f7609..2b8cc7494 100644 --- a/document/core/exec/instructions.rst +++ b/document/core/exec/instructions.rst @@ -7,7 +7,7 @@ Instructions WebAssembly computation is performed by executing individual :ref:`instructions `. -.. index:: numeric instruction, determinism, trap, NaN, value, value type +.. index:: numeric instruction, determinism, non-determinism, trap, NaN, value, value type pair: execution; instruction single: abstract syntax; instruction .. _exec-instr-numeric: @@ -1601,6 +1601,21 @@ Most other vector instructions are defined in terms of numeric operators that ar :math:`\lanes_{\K{i32x4}}(v_1)` and :math:`\lanes_{\K{i32x4}}(v_2)` respectively.
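(Aside, not spec text: the lane-wise lifting described above can be pictured with a small stand-alone OCaml sketch; the i32x4 array type below is only an illustrative stand-in for the interpreter's actual V128 representation, which appears later in this diff.)

(* Model the i32x4 interpretation of a v128 as an int32 array of length 4;
   an illustrative stand-in, not the interpreter's real representation. *)
type i32x4 = int32 array

(* Lift a scalar operator lane-wise, as the spec's op_{t x N} does. *)
let lift_binop (op : int32 -> int32 -> int32) (v1 : i32x4) (v2 : i32x4) : i32x4 =
  Array.init 4 (fun i -> op v1.(i) v2.(i))

(* For example, i32x4.add applies wrap-around 32-bit addition to each lane. *)
let i32x4_add : i32x4 -> i32x4 -> i32x4 = lift_binop Int32.add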
+For non-deterministic operators this definition is generalized to sets: + +.. math:: + \begin{array}{lll} + \X{op}_{t\K{x}N}(n_1,\dots,n_k) &=& + \{ \lanes^{-1}_{t\K{x}N}(i^\ast) ~|~ i^\ast \in \Large\times\xref{Step_pure/instructions}{exec-instr-numeric}{\X{op}}_t(i_1,\dots,i_k)^\ast \land i_1^\ast = \lanes_{t\K{x}N}(n_1) \land \dots \land i_k^\ast = \lanes_{t\K{x}N}(n_k) \} \\ + \end{array} + +where :math:`\Large\times \{x^\ast\}^N` transforms a sequence of :math:`N` sets of values into a set of sequences of :math:`N` values by computing the set product: + +.. math:: + \begin{array}{lll} + \Large\times (S_1 \dots S_N) &=& \{ x_1 \dots x_N ~|~ x_1 \in S_1 \land \dots \land x_N \in S_N \} + \end{array} + .. _exec-vconst: @@ -1735,6 +1750,35 @@ Most other vector instructions are defined in terms of numeric operators that ar \end{array} +.. _exec-relaxed_swizzle: + +:math:`\K{i8x16.}\RELAXEDSWIZZLE` +................................. + +.. todo: align the implementation of swizzle and relaxed_swizzle + +1. Assert: due to :ref:`validation `, two values of :ref:`value type ` |V128| are on the top of the stack. + +2. Pop the value :math:`\V128.\VCONST~c_2` from the stack. + +3. Pop the value :math:`\V128.\VCONST~c_1` from the stack. + +4. Let :math:`c'` be the result of computing :math:`\lanes^{-1}_{\I8X16}(\irelaxedswizzle(\lanes_{\I8X16}(c_1), \lanes_{\I8X16}(c_2)))`. + +5. Push the value :math:`\V128.\VCONST~c'` onto the stack. + +.. math:: + \begin{array}{l} + \begin{array}{lcl@{\qquad}l} + (\V128\K{.}\VCONST~c_1)~(\V128\K{.}\VCONST~c_2)~\I8X16\K{.}\irelaxedswizzle &\stepto& (\V128\K{.}\VCONST~c') + \end{array} + \\ \qquad + \begin{array}[t]{@{}r@{~}l@{}} + (\iff & c' = \lanes^{-1}_{\I8X16}(\irelaxedswizzle(\lanes_{\I8X16}(c_1), \lanes_{\I8X16}(c_2))) + \end{array} + \end{array} + + .. _exec-vec-shuffle: :math:`\K{i8x16.}\SHUFFLE~x^\ast` @@ -1915,6 +1959,66 @@ Most other vector instructions are defined in terms of numeric operators that ar \end{array} +.. _exec-vternop: + +:math:`\shape\K{.}\vternop` +........................... + +1. Assert: due to :ref:`validation `, three values of :ref:`value type ` |V128| are on the top of the stack. + +2. Pop the value :math:`\V128.\VCONST~c_3` from the stack. + +3. Pop the value :math:`\V128.\VCONST~c_2` from the stack. + +4. Pop the value :math:`\V128.\VCONST~c_1` from the stack. + +5. Let :math:`c` be the result of computing :math:`\vternop_{\shape}(c_1, c_2, c_3)`. + +6. Push the value :math:`\V128.\VCONST~c` to the stack. + +.. math:: + \begin{array}{l} + \begin{array}{lcl@{\qquad}l} + (\V128\K{.}\VCONST~c_1)~(\V128\K{.}\VCONST~c_2)~(\V128\K{.}\VCONST~c_3)~\V128\K{.}\vternop &\stepto& (\V128\K{.}\VCONST~c) & + \end{array} + \\ \qquad + \begin{array}[t]{@{}r@{~}l@{}} + (\iff c = \vternop_{\shape}(c_1, c_2, c_3)) \\ + \end{array} + \end{array} + + +.. _exec-relaxed_laneselect: + +:math:`t\K{x}N\K{.}\RELAXEDLANESELECT` +...................................... + +1. Assert: due to :ref:`validation `, three values of :ref:`value type ` |V128| are on the top of the stack. + +2. Pop the value :math:`\V128.\VCONST~c_3` from the stack. + +3. Pop the value :math:`\V128.\VCONST~c_2` from the stack. + +4. Pop the value :math:`\V128.\VCONST~c_1` from the stack. + +5. Let :math:`N` be the :ref:`bit width ` :math:`|t|` of :ref:`value type ` :math:`t`. + +6. Let :math:`c` be the result of computing :math:`\irelaxedlaneselect_{t\K{x}N}(c_1, c_2, c_3)`. + +7. Push the value :math:`\V128.\VCONST~c` to the stack. + +.. 
math:: + \begin{array}{l} + \begin{array}{lcl@{\qquad}l} + (\V128\K{.}\VCONST~c_1)~(\V128\K{.}\VCONST~c_2)~(\V128\K{.}\VCONST~c_3)~\V128\K{.}\RELAXEDLANESELECT &\stepto& (\V128\K{.}\VCONST~c) & \\ + \end{array} + \\ \qquad + \begin{array}[t]{@{}r@{~}l@{}} + (\iff c = \irelaxedlaneselect_{t\K{x}N}(c_1, c_2, c_3)^\ast \\ + \end{array} + \end{array} + + .. _exec-vrelop: :math:`t\K{x}N\K{.}\vrelop` @@ -2232,6 +2336,86 @@ where: \end{array} +.. _exec-relaxed_dot: + + +:math:`\K{i16x8.}\RELAXEDDOT\K{\_i8x16\_i7x16\_s}` +.................................................. + +.. todo: move more of this to numerics + +1. Assert: due to :ref:`validation `, two values of :ref:`value type ` |V128| are on the top of the stack. + +2. Pop the value :math:`\V128.\VCONST~c_2` from the stack. + +3. Pop the value :math:`\V128.\VCONST~c_1` from the stack. + +4. Let :math:`(i_1~i_2)^8` be the result of computing :math:`\irelaxeddotmul_{8,16}(\lanes_{\I8X16}(c_1), \lanes_{\I8X16}(c_2))` + +5. Let :math:`j^8` be the result of computing :math:`\iaddsats_{16}(i_1, i_2)^8`. + +6. Let :math:`c` be the result of computing :math:`\lanes^{-1}_{\I16X8}(j^8)`. + +7. Push the value :math:`\V128.\VCONST~c` onto the stack. + +.. math:: + \begin{array}{l} + \begin{array}{llcl@{\qquad}l} + & (\V128\K{.}\VCONST~c_1)~(\V128\K{.}\VCONST~c_2)~\K{i16x8.}\RELAXEDDOT\K{\_i8x16\_i7x16\_s} &\stepto& (\V128\K{.}\VCONST~c) \\ + \end{array} + \\ \qquad + \begin{array}[t]{@{}r@{~}l@{}} + (\iff & (i_1~i_2)^8 = \irelaxeddotmul_{8,16}(\lanes_{\I8X16}(c_1), \lanes_{\I8X16}(c_2)) \\ + \wedge & j^8 = \iaddsats_{16}(i_1, i_2)^8 \\ + \wedge & c = \lanes^{-1}_{\I16X8}(j^8)) + \end{array} + \end{array} + + +:math:`\K{i32x4.}\RELAXEDDOT\K{\_i8x16\_i7x16\_add\_s}` +....................................................... + +.. todo: move more of this to numerics + +1. Assert: due to :ref:`validation `, three values of :ref:`value type ` |V128| are on the top of the stack. + +2. Pop the value :math:`\V128.\VCONST~c_3` from the stack. + +3. Pop the value :math:`\V128.\VCONST~c_2` from the stack. + +4. Pop the value :math:`\V128.\VCONST~c_1` from the stack. + +5. Let :math:`(i_1~i_2)^8` be the result of computing :math:`\irelaxeddotmul_{8,16}(\lanes_{\I8X16}(c_1), \lanes_{\I8X16}(c_2))` + +6. Let :math:`(j_1~j_2)^4` be the result of computing :math:`\iaddsats_{16}(i_1, i_2)^8`. + +7. Let :math:`j^4` be the result of computing :math:`\iadd_{32}(\extends_{16,32}(j_1), \extends_{16,32}(j_2))^4`. + +8. Let :math:`k^4` be the result of computing :math:`\lanes_{\I32X4}(c_3)`. + +9. Let :math:`l^4` be the result of computing :math:`\iadd_{32}(j, k)^4`. + +10. Let :math:`c` be the result of computing :math:`\lanes^{-1}_{\I32X4}(l^4)`. + +11. Push the value :math:`\V128.\VCONST~c` onto the stack. + +.. math:: + \begin{array}{l} + \begin{array}{llcl@{\qquad}l} + & (\V128\K{.}\VCONST~c_1)~(\V128\K{.}\VCONST~c_2)~(\V128\K{.}\VCONST~c_3)~\K{i32x4.}\RELAXEDDOT\K{\_i8x16\_i7x16\_add\_s} &\stepto& (\V128\K{.}\VCONST~c) \\ + \end{array} + \\ \qquad + \begin{array}[t]{@{}r@{~}l@{}} + (\iff & (i_1~i_2)^8 = \irelaxeddotmul_{8,16}(\lanes_{\I8X16}(c_1), \lanes_{\I8X16}(c_2)) \\ + \wedge & (j_1~j_2)^4 = \iaddsats_{16}(i_1, i_2)^8 \\ + \wedge & j^4 = \iadd_{32}(\extends_{16,32}(j_1), \extends_{16,32}(j_2))^4 \\ + \wedge & k^4 = \lanes_{\I32X4}(c_3) \\ + \wedge & l^4 = \iadd_{32}(j, k)^4 \\ + \wedge & c = \lanes^{-1}_{\I32X4}(l^4)) + \end{array} + \end{array} + + .. 
_exec-vec-extmul: :math:`t_2\K{x}N\K{.}\EXTMUL\K{\_}\half\K{\_}t_1\K{x}M\K{\_}\sx` @@ -2649,6 +2833,7 @@ Table Instructions \end{array} +.. index:: determinism, non-determinism .. _exec-table.grow: :math:`\TABLEGROW~x` @@ -3553,6 +3738,7 @@ Memory Instructions \end{array} +.. index:: determinism, non-determinism .. _exec-memory.grow: :math:`\MEMORYGROW~x` diff --git a/document/core/exec/numerics.rst b/document/core/exec/numerics.rst index 093258e0a..44f823eef 100644 --- a/document/core/exec/numerics.rst +++ b/document/core/exec/numerics.rst @@ -1,4 +1,4 @@ -.. index:: value, integer, floating-point, bit width, determinism, NaN +.. index:: value, integer, floating-point, bit width, determinism, non-determinism, NaN .. _exec-op-partial: .. _exec-numeric: @@ -88,8 +88,8 @@ Conventions: .. math:: \begin{array}{lll@{\qquad}l} - \satu_N(i) &=& 2^N-1 & (\iff i > 2^N-1)\\ \satu_N(i) &=& 0 & (\iff i < 0) \\ + \satu_N(i) &=& 2^N-1 & (\iff i > 2^N-1)\\ \satu_N(i) &=& i & (\otherwise) \\ \end{array} @@ -97,8 +97,8 @@ Conventions: .. math:: \begin{array}{lll@{\qquad}l} - \sats_N(i) &=& \signed_N^{-1}(-2^{N-1}) & (\iff i < -2^{N-1})\\ - \sats_N(i) &=& \signed_N^{-1}(2^{N-1}-1) & (\iff i > 2^{N-1}-1)\\ + \sats_N(i) &=& -2^{N-1} & (\iff i < -2^{N-1})\\ + \sats_N(i) &=& 2^{N-1}-1 & (\iff i > 2^{N-1}-1)\\ \sats_N(i) &=& i & (\otherwise) \end{array} @@ -860,11 +860,11 @@ The integer result of predicates -- i.e., :ref:`tests ` and :ref: * Let :math:`j` be the result of adding :math:`j_1` and :math:`j_2`. -* Return :math:`\sats_N(j)`. +* Return the value whose signed interpretation is :math:`\sats_N(j)`. .. math:: \begin{array}{lll@{\qquad}l} - \iaddsats_N(i_1, i_2) &=& \sats_N(\signed_N(i_1) + \signed_N(i_2)) + \iaddsats_N(i_1, i_2) &=& \signed_N^{-1}(\sats_N(\signed_N(i_1) + \signed_N(i_2))) \end{array} @@ -894,11 +894,11 @@ The integer result of predicates -- i.e., :ref:`tests ` and :ref: * Let :math:`j` be the result of subtracting :math:`j_2` from :math:`j_1`. -* Return :math:`\sats_N(j)`. +* Return the value whose signed interpretation is :math:`\sats_N(j)`. .. math:: \begin{array}{lll@{\qquad}l} - \isubsats_N(i_1, i_2) &=& \sats_N(\signed_N(i_1) - \signed_N(i_2)) + \isubsats_N(i_1, i_2) &=& \signed_N^{-1}(\sats_N(\signed_N(i_1) - \signed_N(i_2))) \end{array} @@ -922,11 +922,11 @@ The integer result of predicates -- i.e., :ref:`tests ` and :ref: :math:`\iq15mulrsats_N(i_1, i_2)` ................................. -* Return the result of :math:`\sats_N(\ishrs_N(i_1 \cdot i_2 + 2^{14}, 15))`. +* Return the value whose signed interpretation is :math:`\sats_N(\ishrs_N(i_1 \cdot i_2 + 2^{14}, 15))`. .. math:: \begin{array}{lll@{\qquad}l} - \iq15mulrsats_N(i_1, i_2) &=& \sats_N(\ishrs_N(i_1 \cdot i_2 + 2^{14}, 15)) + \iq15mulrsats_N(i_1, i_2) &=& \signed_N^{-1}(\sats_N(\ishrs_N(i_1 \cdot i_2 + 2^{14}, 15))) \end{array} @@ -1033,7 +1033,7 @@ where: \end{array} -.. index:: NaN +.. index:: NaN, determinism, non-determinism .. _aux-nans: NaN Propagation ~~~~~~~~~~~~~~~ @@ -1046,14 +1046,17 @@ then its sign is non-deterministic and the :ref:`payload ` is co * Otherwise the payload is picked non-deterministically among all :ref:`arithmetic NaNs `; that is, its most significant bit is :math:`1` and all others are unspecified. -This non-deterministic result is expressed by the following auxiliary function producing a set of allowed outputs from a set of inputs: +* In the :ref:`deterministic profile `, however, a positive canonical NaN is reliably produced in the latter case.
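(Aside, not spec text: concretely, the canonical payload for f32 is canon_32 = 2^22, so the positive canonical NaN that the deterministic profile produces has bit pattern 0x7FC00000. A stand-alone OCaml check, assuming only the standard library:)

(* Positive canonical f32 NaN: sign 0, exponent all ones, only the most
   significant payload bit set. *)
let () =
  let z = Int32.float_of_bits 0x7fc0_0000l in
  assert (Float.is_nan z)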
+ +The non-deterministic result is expressed by the following auxiliary function producing a set of allowed outputs from a set of inputs: .. math:: - \begin{array}{lll@{\qquad}l} - \nans_N\{z^\ast\} &=& \{ + \NAN(n), - \NAN(n) ~|~ n = \canon_N \} - & (\iff \forall \NAN(n) \in z^\ast,~ n = \canon_N) \\ - \nans_N\{z^\ast\} &=& \{ + \NAN(n), - \NAN(n) ~|~ n \geq \canon_N \} - & (\otherwise) \\ + \begin{array}{llcl@{\qquad}l} + & \nans_N\{z^\ast\} &=& \{ + \NAN(\canon_N) \} \\ + \exprofiles{\PROFDET} & \nans_N\{z^\ast\} &=& \{ + \NAN(n), - \NAN(n) ~|~ n = \canon_N \} + & (\iff \{z^\ast\} \subseteq \{ + \NAN(\canon_N), - \NAN(\canon_N) \}) \\ + \exprofiles{\PROFDET} & \nans_N\{z^\ast\} &=& \{ + \NAN(n), - \NAN(n) ~|~ n \geq \canon_N \} + & (\iff \{z^\ast\} \not\subseteq \{ + \NAN(\canon_N), - \NAN(\canon_N) \}) \\ \end{array} @@ -1236,6 +1239,55 @@ This non-deterministic result is expressed by the following auxiliary function p \end{array} +.. _op-fma: + +:math:`\fma_N(z_1, z_2, z_3)` +............................. + +The function :math:`\fma` is the same as *fusedMultiplyAdd* defined by |IEEE754|_ (Section 5.4.1). +It computes :math:`(z_1 \cdot z_2) + z_3` as if with unbounded range and precision, rounding only once for the final result. + +* If either :math:`z_1` or :math:`z_2` or :math:`z_3` is a NaN, return an element of :math:`\nans_N\{z_1, z_2, z_3\}`. + +* Else if either :math:`z_1` or :math:`z_2` is a zero and the other is an infinity, then return an element of :math:`\nans_N\{\}`. + +* Else if both :math:`z_1` and :math:`z_2` are infinities of equal sign, and :math:`z_3` is a negative infinity, then return an element of :math:`\nans_N\{\}`. + +* Else if both :math:`z_1` and :math:`z_2` are infinities of opposite sign, and :math:`z_3` is a positive infinity, then return an element of :math:`\nans_N\{\}`. + +* Else if either :math:`z_1` or :math:`z_2` is an infinity and the other is a value of the same sign, and :math:`z_3` is a negative infinity, then return an element of :math:`\nans_N\{\}`. + +* Else if either :math:`z_1` or :math:`z_2` is an infinity and the other is a value of the opposite sign, and :math:`z_3` is a positive infinity, then return an element of :math:`\nans_N\{\}`. + +* Else if both :math:`z_1` and :math:`z_2` are zeroes of the same sign and :math:`z_3` is a zero, then return positive zero. + +* Else if both :math:`z_1` and :math:`z_2` are zeroes of the opposite sign and :math:`z_3` is a positive zero, then return positive zero. + +* Else if both :math:`z_1` and :math:`z_2` are zeroes of the opposite sign and :math:`z_3` is a negative zero, then return negative zero. + +* Else return the result of multiplying :math:`z_1` and :math:`z_2`, adding :math:`z_3` to the intermediate result, with the final result :ref:`rounded ` to the nearest representable value. + +..
math:: + \begin{array}{@{}llcll} + & \fma_N(\pm \NAN(n), z_2, z_3) &=& \nans_N\{\pm \NAN(n), z_2, z_3\} \\ + & \fma_N(z_1, \pm \NAN(n), z_3) &=& \nans_N\{\pm \NAN(n), z_1, z_3\} \\ + & \fma_N(z_1, z_2, \pm \NAN(n)) &=& \nans_N\{\pm \NAN(n), z_1, z_2\} \\ + & \fma_N(\pm \infty, \pm 0, z_3) &=& \nans_N\{\} \\ + & \fma_N(\pm \infty, \mp 0, z_3) &=& \nans_N\{\} \\ + & \fma_N(\pm \infty, \pm \infty, - \infty) &=& \nans_N\{\} \\ + & \fma_N(\pm \infty, \mp \infty, + \infty) &=& \nans_N\{\} \\ + & \fma_N(\pm q_1, \pm \infty, - \infty) &=& \nans_N\{\} \\ + & \fma_N(\pm q_1, \mp \infty, + \infty) &=& \nans_N\{\} \\ + & \fma_N(\pm \infty, \pm q_1, - \infty) &=& \nans_N\{\} \\ + & \fma_N(\mp \infty, \pm q_1, + \infty) &=& \nans_N\{\} \\ + & \fma_N(\pm 0, \pm 0, \mp 0) &=& + 0 \\ + & \fma_N(\pm 0, \pm 0, \pm 0) &=& + 0 \\ + & \fma_N(\pm 0, \mp 0, + 0) &=& + 0 \\ + & \fma_N(\pm 0, \mp 0, - 0) &=& - 0 \\ + & \fma_N(z_1, z_2, z_3) &=& \ieee_N(z_1 \cdot z_2 + z_3) \\ + \end{array} + + .. _op-fmin: :math:`\fmin_N(z_1, z_2)` @@ -1849,14 +1901,14 @@ Conversions * Else if :math:`z` is positive infinity, then return :math:`2^{N-1} - 1`. -* Else, return :math:`\sats_N(\trunc(z))`. +* Else, return the value whose signed interpretation is :math:`\sats_N(\trunc(z))`. .. math:: \begin{array}{lll@{\qquad}l} \truncsats_{M,N}(\pm \NAN(n)) &=& 0 \\ \truncsats_{M,N}(- \infty) &=& -2^{N-1} \\ \truncsats_{M,N}(+ \infty) &=& 2^{N-1}-1 \\ - \truncsats_{M,N}(z) &=& \sats_N(\trunc(z)) \\ + \truncsats_{M,N}(z) &=& \signed_N^{-1}(\sats_N(\trunc(z))) \\ \end{array} @@ -1954,11 +2006,11 @@ Conversions * Let :math:`j` be the :ref:`signed interpretation ` of :math:`i` of size :math:`M`. -* Return :math:`\sats_N(j)`. +* Return the value whose signed interpretation is :math:`\sats_N(j)`. .. math:: \begin{array}{lll@{\qquad}l} - \narrows_{M,N}(i) &=& \sats_N(\signed_M(i)) + \narrows_{M,N}(i) &=& \signed_N^{-1}(\sats_N(\signed_M(i))) \end{array} @@ -1975,3 +2027,280 @@ Conversions \begin{array}{lll@{\qquad}l} \narrowu_{M,N}(i) &=& \satu_N(\signed_M(i)) \end{array} + + +.. _relaxed-ops: +.. _aux-relaxed: + +Relaxed Operations +~~~~~~~~~~~~~~~~~~ + +The results of *relaxed* operators are *implementation-dependent*, because the set of possible results may depend on properties of the host environment, such as its hardware. +Technically, their behaviour is controlled by a set of *global parameters* to the semantics that an implementation can instantiate in different ways. +These choices are fixed, that is, parameters are constant during the execution of any given program. + +Every such parameter is an index into a sequence of possible sets of results and must be instantiated to a defined index. +In the :ref:`deterministic profile `, every parameter is prescribed to be 0. +This behaviour is expressed by the following auxiliary function, +where :math:`R` is a global parameter selecting one of the allowed outcomes: + +.. math:: + \begin{array}{@{}lcll} + \EXPROFDET & \relaxed(R)[ A_0, \dots, A_n ] = A_R \\ + & \relaxed(R)[ A_0, \dots, A_n ] = A_0 \\ + \end{array} + +.. note:: + Each parameter can be thought of as inducing a family of operations + that is fixed to one particular choice by an implementation. + The fixed operation itself can still be non-deterministic or partial. + + Implementations are expected to either choose the behaviour that is the most efficient on the underlying hardware, + or the behaviour of the deterministic profile. + + +..
_op-frelaxed_madd: + +:math:`\frelaxedmadd_N(z_1, z_2, z_3)` +...................................... + +The implementation-specific behaviour of this operation is determined by the global parameter :math:`R_{\F{fmadd}} \in \{0, 1\}`. + +* Return :math:`\relaxed(R_{\F{fmadd}})[ \fadd_N(\fmul_N(z_1, z_2), z_3), \fma_N(z_1, z_2, z_3) ]`. + +.. math:: + \begin{array}{@{}lcll} + \frelaxedmadd_N(z_1, z_2, z_3) &=& \relaxed(R_{\F{fmadd}})[ \fadd_N(\fmul_N(z_1, z_2), z_3), \fma_N(z_1, z_2, z_3) ] \\ + \end{array} + +.. note:: + Relaxed multiply-add allows for fused or unfused results, + which leads to implementation-dependent rounding behaviour. + In the :ref:`deterministic profile `, + the unfused behaviour is used. + + +.. _op-frelaxed_nmadd: + +:math:`\frelaxednmadd_N(z_1, z_2, z_3)` +....................................... + +* Return :math:`\frelaxedmadd_N(-z_1, z_2, z_3)`. + +.. math:: + \begin{array}{@{}lcll} + \frelaxednmadd_N(z_1, z_2, z_3) &=& \frelaxedmadd_N(-z_1, z_2, z_3) \\ + \end{array} + +.. note:: + This operation is implementation-dependent because :math:`\frelaxedmadd` is implementation-dependent. + + +.. _op-frelaxed_min: + +:math:`\frelaxedmin_N(z_1, z_2)` +................................ + +The implementation-specific behaviour of this operation is determined by the global parameter :math:`R_{\F{fmin}} \in \{0, 1, 2, 3\}`. + +* If :math:`z_1` is a NaN, then return :math:`\relaxed(R_{\F{fmin}})[ \fmin_N(z_1, z_2), \NAN(n), z_2, z_2 ]`. + +* If :math:`z_2` is a NaN, then return :math:`\relaxed(R_{\F{fmin}})[ \fmin_N(z_1, z_2), z_1, \NAN(n), z_1 ]`. + +* If both :math:`z_1` and :math:`z_2` are zeroes of opposite sign, then return :math:`\relaxed(R_{\F{fmin}})[ \fmin_N(z_1, z_2), \pm 0, \mp 0, -0 ]`. + +* Return :math:`\fmin_N(z_1, z_2)`. + +.. math:: + \begin{array}{@{}lcll} + \frelaxedmin_N(\pm \NAN(n), z_2) &=& \relaxed(R_{\F{fmin}})[ \fmin_N(\pm \NAN(n), z_2), \NAN(n), z_2, z_2 ] \\ + \frelaxedmin_N(z_1, \pm \NAN(n)) &=& \relaxed(R_{\F{fmin}})[ \fmin_N(z_1, \pm \NAN(n)), z_1, \NAN(n), z_1 ] \\ + \frelaxedmin_N(\pm 0, \mp 0) &=& \relaxed(R_{\F{fmin}})[ \fmin_N(\pm 0, \mp 0), \pm 0, \mp 0, -0 ] \\ + \frelaxedmin_N(z_1, z_2) &=& \fmin_N(z_1, z_2) & (\otherwise) \\ + \end{array} + +.. note:: + Relaxed minimum is implementation-dependent for NaNs and for zeroes with different signs. + In the :ref:`deterministic profile `, + it behaves like regular :math:`\fmin`. + + +.. _op-frelaxed_max: + +:math:`\frelaxedmax_N(z_1, z_2)` +................................ + +The implementation-specific behaviour of this operation is determined by the global parameter :math:`R_{\F{fmax}} \in \{0, 1, 2, 3\}`. + +* If :math:`z_1` is a NaN, then return :math:`\relaxed(R_{\F{fmax}})[ \fmax_N(z_1, z_2), \NAN(n), z_2, z_2 ]`. + +* If :math:`z_2` is a NaN, then return :math:`\relaxed(R_{\F{fmax}})[ \fmax_N(z_1, z_2), z_1, \NAN(n), z_1 ]`. + +* If both :math:`z_1` and :math:`z_2` are zeroes of opposite sign, then return :math:`\relaxed(R_{\F{fmax}})[ \fmax_N(z_1, z_2), \pm 0, \mp 0, +0 ]`. + +* Return :math:`\fmax_N(z_1, z_2)`. + +.. math:: + \begin{array}{@{}lcll} + \frelaxedmax_N(\pm \NAN(n), z_2) &=& \relaxed(R_{\F{fmax}})[ \fmax_N(\pm \NAN(n), z_2), \NAN(n), z_2, z_2 ] \\ + \frelaxedmax_N(z_1, \pm \NAN(n)) &=& \relaxed(R_{\F{fmax}})[ \fmax_N(z_1, \pm \NAN(n)), z_1, \NAN(n), z_1 ] \\ + \frelaxedmax_N(\pm 0, \mp 0) &=& \relaxed(R_{\F{fmax}})[ \fmax_N(\pm 0, \mp 0), \pm 0, \mp 0, +0 ] \\ + \frelaxedmax_N(z_1, z_2) &=& \fmax_N(z_1, z_2) & (\otherwise) \\ + \end{array} + +..
note:: + Relaxed maximum is implementation-dependent for NaNs and for zeroes with different signs. + In the :ref:`deterministic profile `, + it behaves like regular :math:`\fmax`. + + +.. _op-irelaxed_dot_mul: + +:math:`\irelaxeddotmul_{M,N}(i_1, i_2)` +....................................... + +This is an auxiliary operator for the specification of |RELAXEDDOT|. + +The implementation-specific behaviour of this operation is determined by the global parameter :math:`R_{\F{idot}} \in \{0, 1\}`. + +* Return :math:`\relaxed(R_{\F{idot}})[ \imul_N(\extends_{M,N}(i_1), \extends_{M,N}(i_2)), \imul_N(\extends_{M,N}(i_1), \extendu_{M,N}(i_2)) ]`. + +.. math:: + \begin{array}{@{}lcll} + \irelaxeddotmul_{M,N}(i_1, i_2) &=& \relaxed(R_{\F{idot}})[ \imul_N(\extends_{M,N}(i_1), \extends_{M,N}(i_2)), \imul_N(\extends_{M,N}(i_1), \extendu_{M,N}(i_2)) ] \\ + \end{array} + +.. note:: + Relaxed dot product is implementation-dependent when the second operand is negative in a signed interpretation. + In the :ref:`deterministic profile `, + it behaves like signed dot product. + + +.. _op-irelaxed_q15mulr_s: + +:math:`\irelaxedq15mulrs_N(i_1, i_2)` +..................................... + +The implementation-specific behaviour of this operation is determined by the global parameter :math:`R_{\F{iq15mulr}} \in \{0, 1\}`. + +* If both :math:`i_1` and :math:`i_2` equal :math:`\signed^{-1}_N(-2^{N-1})`, then return :math:`\relaxed(R_{\F{iq15mulr}})[ 2^{N-1}-1, \signed^{-1}_N(-2^{N-1}) ]`. + +* Return :math:`\iq15mulrsats_N(i_1, i_2)`. + +.. math:: + \begin{array}{@{}lcll} + \irelaxedq15mulrs_N(\signed^{-1}_N(-2^{N-1}), \signed^{-1}_N(-2^{N-1})) &=& \relaxed(R_{\F{iq15mulr}})[ 2^{N-1}-1, \signed^{-1}_N(-2^{N-1}) ] & \\ + \irelaxedq15mulrs_N(i_1, i_2) &=& \iq15mulrsats_N(i_1, i_2) + \end{array} + +.. note:: + Relaxed Q15 multiplication is implementation-dependent when the result overflows. + In the :ref:`deterministic profile `, + it behaves like regular :math:`\iq15mulrsats`. + + +.. _op-relaxed_trunc: +.. _op-relaxed_trunc_u: + +:math:`\relaxedtrunc^u_{M,N}(z)` +................................ + +The implementation-specific behaviour of this operation is determined by the global parameter :math:`R_{\F{trunc\_u}} \in \{0, 1, 2, 3\}`. + +* If :math:`z` is normal or subnormal and :math:`\trunc(z)` is non-negative and less than :math:`2^N`, then return :math:`\truncu_{M,N}(z)`. + +* Else, return :math:`\relaxed(R_{\F{trunc\_u}})[ \truncsatu_{M,N}(z), 2^N-1, 2^N-2, 2^{N-1} ]`. + +.. math:: + \begin{array}{@{}lcll} + \relaxedtrunc^u_{M,N}(\pm q) &=& \truncu_{M,N}(\pm q) & (\iff 0 \leq \trunc(\pm q) < 2^N) \\ + \relaxedtrunc^u_{M,N}(z) &=& \relaxed(R_{\F{trunc\_u}})[ \truncsatu_{M,N}(z), 2^{N}-1, 2^{N}-2, 2^{N-1}] & (\otherwise) \\ + \end{array} + +.. note:: + Relaxed unsigned truncation is implementation-dependent for NaNs and out-of-range values. + In the :ref:`deterministic profile `, + it behaves like regular :math:`\truncsatu`. + + +.. _op-relaxed_trunc_s: + +:math:`\relaxedtrunc^s_{M,N}(z)` +................................ + +The implementation-specific behaviour of this operation is determined by the global parameter :math:`R_{\F{trunc\_s}} \in \{0, 1\}`. + +* If :math:`z` is normal or subnormal and :math:`\trunc(z)` is greater than or equal to :math:`-2^{N-1}` and less than :math:`2^{N-1}`, then return :math:`\truncs_{M,N}(z)`. + +* Else, return :math:`\relaxed(R_{\F{trunc\_s}})[ \truncsats_{M,N}(z), \signed^{-1}_N(-2^{N-1}) ]`. + +..
math:: + \begin{array}{@{}lcll} + \relaxedtrunc^s_{M,N}(\pm q) &=& \truncs_{M,N}(\pm q) & (\iff -2^{N-1} \leq \trunc(\pm q) < 2^{N-1}) \\ + \relaxedtrunc^s_{M,N}(z) &=& \relaxed(R_{\F{trunc\_s}})[ \truncsats_{M,N}(z), \signed^{-1}_N(-2^{N-1})] & (\otherwise) \\ + \end{array} + +.. note:: + Relaxed signed truncation is implementation-dependent for NaNs and out-of-range values. + In the :ref:`deterministic profile `, + it behaves like regular :math:`\truncsats`. + + +.. _op-irelaxed_swizzle: +.. _op-irelaxed_swizzle_lane: + +:math:`\irelaxedswizzle(i^n, j^n)` +.................................. + +The implementation-specific behaviour of this operation is determined by the global parameter :math:`R_{\F{swizzle}} \in \{0, 1\}`. + +* For each :math:`j_k` in :math:`j^n`, let :math:`r_k` be the value :math:`\irelaxedswizzlelane(i^n, j_k)`. + +* Let :math:`r^n` be the concatenation of all :math:`r_k`. + +* Return :math:`r^n`. + +.. math:: + \begin{array}{@{}lcl} + \irelaxedswizzle(i^n, j^n) &=& \irelaxedswizzlelane(i^n, j)^n \\ + \end{array} + +where: + +.. math:: + \begin{array}{@{}lcll} + \irelaxedswizzlelane(i^n, j) &=& i[j] & (\iff j < 16) \\ + \irelaxedswizzlelane(i^n, j) &=& 0 & (\iff \signed_8(j) < 0) \\ + \irelaxedswizzlelane(i^n, j) &=& \relaxed(R_{\F{swizzle}})[ 0, i[j \mod n] ] & (\otherwise) \\ + \end{array} + +.. note:: + Relaxed swizzle is implementation-dependent + if the signed interpretation of any of the 8-bit indices in :math:`j^n` is larger than or equal to 16. + In the :ref:`deterministic profile `, + it behaves like regular :math:`\SWIZZLE`. + + +.. _op-irelaxed_laneselect: + +:math:`\irelaxedlaneselect_N(i_1, i_2, i_3)` +............................................ + +The implementation-specific behaviour of this operation is determined by the global parameter :math:`R_{\F{laneselect}} \in \{0, 1\}`. + +* If :math:`i_3` is smaller than :math:`2^{N-1}`, then let :math:`i'_3` be the value :math:`0`, otherwise :math:`2^N-1`. + +* Let :math:`i''_3` be :math:`\relaxed(R_{\F{laneselect}})[i_3, i'_3]`. + +* Return :math:`\ibitselect_N(i_1, i_2, i''_3)`. + +.. math:: + \begin{array}{@{}lcll} + \irelaxedlaneselect_N(i_1, i_2, i_3) &=& \ibitselect_N(i_1, i_2, \relaxed(R_{\F{laneselect}})[ i_3, \extends_{1,N}(\ishru_N(i_3, N-1)) ]) \\ + \end{array} + +.. note:: + Relaxed lane selection is non-deterministic when the mask mixes set and cleared bits, + since the value of the high bit may or may not be expanded to all bits. + In the :ref:`deterministic profile `, + it behaves like :math:`\ibitselect`. diff --git a/document/core/syntax/instructions.rst b/document/core/syntax/instructions.rst index f96512ccb..615c1c6b9 100644 --- a/document/core/syntax/instructions.rst +++ b/document/core/syntax/instructions.rst @@ -190,7 +190,10 @@ Occasionally, it is convenient to group operators together according to the foll .. _syntax-visatbinop: .. _syntax-vfunop: .. _syntax-vfbinop: +.. _syntax-rvfbinop: +.. _syntax-rvfternop: .. _syntax-instr-vec: +.. 
_syntax-instr-relaxed: Vector Instructions ~~~~~~~~~~~~~~~~~~~ @@ -273,8 +276,16 @@ Vector instructions (also known as *SIMD* instructions, *single instruction mult \K{f32x4.}\VCONVERT\K{\_i32x4\_}\sx ~|~ \K{f32x4.}\VDEMOTE\K{\_f64x2\_zero} \\&&|& \K{f64x2.}\VCONVERT\K{\_low\_i32x4\_}\sx ~|~ - \K{f64x2.}\VPROMOTE\K{\_low\_f32x4} \\&&|& - \dots \\ + \K{f64x2.}\VPROMOTE\K{\_low\_f32x4} \\ + & &|& \K{i8x16.}\RELAXEDSWIZZLE \\ + & &|& \K{i16x8.}\RELAXEDQ15MULRS \\ + & &|& \K{i32x4.}\RELAXEDTRUNC\K{\_f32x4\_}\sx \\ + & &|& \K{i16x8.}\RELAXEDDOT\K{\_i8x16\_i7x16\_s} \\ + & &|& \K{i32x4.}\RELAXEDDOT\K{\_i8x16\_i7x16\_add\_s} \\ + & &|& \ishape\K{.}\RELAXEDLANESELECT \\ + & &|& \fshape\K{.}\rvfternop \\ + & &|& \fshape\K{.}\rvfbinop \\ + & &|& \dots \\ \end{array} .. math:: @@ -338,6 +349,12 @@ Vector instructions (also known as *SIMD* instructions, *single instruction mult \K{max} ~|~ \K{pmin} ~|~ \K{pmax} \\ + \production{relaxed vector floating-point binary operator} & \rvfbinop &::=& + \K{relaxed\_min} ~|~ + \K{relaxed\_max} \\ + \production{relaxed vector floating-point ternary operator} & \rvfternop &::=& + \K{relaxed\_madd} ~|~ + \K{relaxed\_nmadd} \\ \end{array} .. _syntax-vec-shape: @@ -382,6 +399,7 @@ For the other vector instructions, the use of two's complement for the signed in .. _syntax-vunop: .. _syntax-vbinop: .. _syntax-vrelop: +.. _syntax-vternop: .. _syntax-vtestop: .. _syntax-vcvtop: @@ -399,9 +417,14 @@ Occasionally, it is convenient to group operators together according to the foll \production{binary operator} & \vbinop &::=& \vibinop ~|~ \vfbinop \\&&|& \viminmaxop ~|~ \visatbinop \\&&|& + \rvfbinop \\&&|& \VMUL ~|~ \AVGR\K{\_u} ~|~ - \Q15MULRSAT\K{\_s} \\ + \Q15MULRSAT\K{\_s} \\&&|& + \RELAXEDQ15MULRS\K{\_s} \\ + \production{ternary operator} & \vternop &::=& + \vvternop ~|~ + \rvfternop \\ \production{test operator} & \vtestop &::=& \vitestop \\ \production{relational operator} & \vrelop &::=& @@ -411,7 +434,8 @@ Occasionally, it is convenient to group operators together according to the foll \VTRUNC\K{\_sat} ~|~ \VCONVERT ~|~ \VDEMOTE ~|~ - \VPROMOTE \\ + \VPROMOTE ~|~ + \RELAXEDTRUNC \\ \end{array} diff --git a/document/core/text/instructions.rst b/document/core/text/instructions.rst index 4c857db99..5398e0c69 100644 --- a/document/core/text/instructions.rst +++ b/document/core/text/instructions.rst @@ -1053,6 +1053,31 @@ Vector constant instructions have a mandatory :ref:`shape ` de \text{f64x2.promote\_low\_f32x4} &\Rightarrow& \F64X2.\VPROMOTE\K{\_low\_f32x4}\\ \end{array} +.. 
math:: + \begin{array}{llclll} + \phantom{\production{instruction}} & \phantom{\Tplaininstr_I} &\phantom{::=}& \phantom{averylonginstructionnameforvectext} && \phantom{vechasreallyreallyreallylonginstructionnames} \\[-2ex] &&|& + \text{i8x16.relaxed\_swizzle} &\Rightarrow& \I8X16.\RELAXEDSWIZZLE \\ &&|& + \text{i32x4.relaxed\_trunc\_f32x4\_s} &\Rightarrow& \I32X4.\RELAXEDTRUNC\K{\_f32x4\_s} \\ &&|& + \text{i32x4.relaxed\_trunc\_f32x4\_u} &\Rightarrow& \I32X4.\RELAXEDTRUNC\K{\_f32x4\_u} \\ &&|& + \text{i32x4.relaxed\_trunc\_f64x2\_s\_zero} &\Rightarrow& \I32X4.\RELAXEDTRUNC\K{\_f64x2\_s\_zero} \\ &&|& + \text{i32x4.relaxed\_trunc\_f64x2\_u\_zero} &\Rightarrow& \I32X4.\RELAXEDTRUNC\K{\_f64x2\_u\_zero} \\ &&|& + \text{f32x4.relaxed\_madd} &\Rightarrow& \F32X4.\RELAXEDMADD \\ &&|& + \text{f32x4.relaxed\_nmadd} &\Rightarrow& \F32X4.\RELAXEDNMADD \\ &&|& + \text{f64x2.relaxed\_madd} &\Rightarrow& \F64X2.\RELAXEDMADD \\ &&|& + \text{f64x2.relaxed\_nmadd} &\Rightarrow& \F64X2.\RELAXEDNMADD \\ &&|& + \text{i8x16.relaxed\_laneselect} &\Rightarrow& \I8X16.\RELAXEDLANESELECT \\ &&|& + \text{i16x8.relaxed\_laneselect} &\Rightarrow& \I16X8.\RELAXEDLANESELECT \\ &&|& + \text{i32x4.relaxed\_laneselect} &\Rightarrow& \I32X4.\RELAXEDLANESELECT \\ &&|& + \text{i64x2.relaxed\_laneselect} &\Rightarrow& \I64X2.\RELAXEDLANESELECT \\ &&|& + \text{f32x4.relaxed\_min} &\Rightarrow& \F32X4.\RELAXEDMIN \\ &&|& + \text{f32x4.relaxed\_max} &\Rightarrow& \F32X4.\RELAXEDMAX \\ &&|& + \text{f64x2.relaxed\_min} &\Rightarrow& \F64X2.\RELAXEDMIN \\ &&|& + \text{f64x2.relaxed\_max} &\Rightarrow& \F64X2.\RELAXEDMAX \\ &&|& + \text{i16x8.relaxed\_q15mulr\_s} &\Rightarrow& \I16X8.\RELAXEDQ15MULRS \\ &&|& + \text{i16x8.relaxed\_dot\_i8x16\_i7x16\_s} &\Rightarrow& \I16X8.\RELAXEDDOT\K{\_i8x16\_i7x16\_s} \\ &&|& + \text{i32x4.relaxed\_dot\_i8x16\_i7x16\_add\_s} &\Rightarrow& \I32X4.\RELAXEDDOT\K{\_i8x16\_i7x16\_add\_s} + \end{array} + .. index:: ! folded instruction, S-expression .. _text-foldedinstr: diff --git a/document/core/util/bikeshed_fixup.py b/document/core/util/bikeshed_fixup.py index 7041d875d..bcef6c1af 100755 --- a/document/core/util/bikeshed_fixup.py +++ b/document/core/util/bikeshed_fixup.py @@ -29,10 +29,12 @@ def Main(): number = 1 for section in [ 'Embedding', + 'Profiles', 'Implementation Limitations', 'Validation Algorithm', 'Custom Sections', 'Soundness', + 'Changes', 'Index of Types', 'Index of Instructions', 'Index of Semantic Rules']: diff --git a/document/core/util/macros.def b/document/core/util/macros.def index 85a6524d0..30516d8ef 100644 --- a/document/core/util/macros.def +++ b/document/core/util/macros.def @@ -109,6 +109,18 @@ .. |bigcompose| mathdef:: \xref{syntax/conventions}{notation-compose}{\bigoplus} +.. Profile Annotations +.. ------------------- + +.. |exprofiles#1| mathdef:: {^{\small{[!#1]}}} +.. |profilename#1| mathdef:: {\small{\mathrm{#1}}} +.. |profile| mathdef:: \mathrm + +.. |PROFFULL| mathdef:: \xref{appendix/profiles}{profile-full}{\profile{FUL}} +.. |PROFDET| mathdef:: \xref{appendix/profiles}{profile-deterministic}{\profile{DET}} +.. |EXPROFDET| mathdef:: \exprofiles{\PROFDET} + + .. Abstract Syntax .. --------------- @@ -623,6 +635,15 @@ .. |EXTADDPAIRWISE| mathdef:: \xref{syntax/instructions}{syntax-instr-vec}{\K{extadd\_pairwise}} .. |VDEMOTE| mathdef:: \xref{syntax/instructions}{syntax-instr-vec}{\K{demote}} .. |VPROMOTE| mathdef:: \xref{syntax/instructions}{syntax-instr-vec}{\K{promote}} +..
|RELAXEDMADD| mathdef:: \xref{syntax/instructions}{syntax-instr-vec}{\K{relaxed\_madd}} +.. |RELAXEDNMADD| mathdef:: \xref{syntax/instructions}{syntax-instr-vec}{\K{relaxed\_nmadd}} +.. |RELAXEDSWIZZLE| mathdef:: \xref{syntax/instructions}{syntax-instr-vec}{\K{relaxed\_swizzle}} +.. |RELAXEDLANESELECT| mathdef:: \xref{syntax/instructions}{syntax-instr-vec}{\K{relaxed\_laneselect}} +.. |RELAXEDMIN| mathdef:: \xref{syntax/instructions}{syntax-instr-vec}{\K{relaxed\_min}} +.. |RELAXEDMAX| mathdef:: \xref{syntax/instructions}{syntax-instr-vec}{\K{relaxed\_max}} +.. |RELAXEDQ15MULRS| mathdef:: \xref{syntax/instructions}{syntax-instr-vec}{\K{relaxed\_q15mulr\_s}} +.. |RELAXEDTRUNC| mathdef:: \xref{syntax/instructions}{syntax-instr-vec}{\K{relaxed\_trunc}} +.. |RELAXEDDOT| mathdef:: \xref{syntax/instructions}{syntax-instr-vec}{\K{relaxed\_dot}} .. Instructions, non-terminals @@ -655,6 +676,7 @@ .. |vunop| mathdef:: \xref{syntax/instructions}{syntax-vunop}{\X{vunop}} .. |vbinop| mathdef:: \xref{syntax/instructions}{syntax-vbinop}{\X{vbinop}} +.. |vternop| mathdef:: \xref{syntax/instructions}{syntax-vternop}{\X{vternop}} .. |vrelop| mathdef:: \xref{syntax/instructions}{syntax-vrelop}{\X{vrelop}} .. |vcvtop| mathdef:: \xref{syntax/instructions}{syntax-vcvtop}{\X{vcvtop}} @@ -674,6 +696,8 @@ .. |vfrelop| mathdef:: \xref{syntax/instructions}{syntax-vfrelop}{\X{vfrelop}} .. |vitestop| mathdef:: \xref{syntax/instructions}{syntax-vitestop}{\X{vitestop}} .. |vtestop| mathdef:: \xref{syntax/instructions}{syntax-vtestop}{\X{vtestop}} +.. |rvfbinop| mathdef:: \xref{syntax/instructions}{syntax-rvfbinop}{\X{rvfbinop}} +.. |rvfternop| mathdef:: \xref{syntax/instructions}{syntax-rvfternop}{\X{rvfternop}} .. |sx| mathdef:: \xref{syntax/instructions}{syntax-sx}{\X{sx}} .. |half| mathdef:: \xref{syntax/instructions}{syntax-half}{\X{half}} @@ -1466,6 +1490,7 @@ .. |fsub| mathdef:: \xref{exec/numerics}{op-fsub}{\F{fsub}} .. |fmul| mathdef:: \xref{exec/numerics}{op-fmul}{\F{fmul}} .. |fdiv| mathdef:: \xref{exec/numerics}{op-fdiv}{\F{fdiv}} +.. |fma| mathdef:: \xref{exec/numerics}{op-fma}{\F{fma}} .. |fmin| mathdef:: \xref{exec/numerics}{op-fmin}{\F{fmin}} .. |fmax| mathdef:: \xref{exec/numerics}{op-fmax}{\F{fmax}} .. |fcopysign| mathdef:: \xref{exec/numerics}{op-fcopysign}{\F{fcopysign}} @@ -1502,6 +1527,17 @@ .. |narrowu| mathdef:: \xref{exec/numerics}{op-narrow_u}{\F{narrow}^{\K{u}}} .. |narrows| mathdef:: \xref{exec/numerics}{op-narrow_s}{\F{narrow}^{\K{s}}} +.. |frelaxedmadd| mathdef:: \xref{exec/numerics}{op-frelaxed_madd}{\F{frelaxed\_madd}} +.. |frelaxednmadd| mathdef:: \xref{exec/numerics}{op-frelaxed_nmadd}{\F{frelaxed\_nmadd}} +.. |irelaxedswizzlelane| mathdef:: \xref{exec/numerics}{op-irelaxed_swizzle_lane}{\F{frelaxed\_swizzle\_lane}} +.. |irelaxedswizzle| mathdef:: \xref{exec/numerics}{op-irelaxed_swizzle}{\F{frelaxed\_swizzle}} +.. |relaxedtrunc| mathdef:: \xref{exec/numerics}{op-relaxed_trunc}{\F{relaxed\_trunc}} +.. |irelaxedlaneselect| mathdef:: \xref{exec/numerics}{op-irelaxed_laneselect}{\F{irelaxed\_laneselect}} +.. |frelaxedmin| mathdef:: \xref{exec/numerics}{op-frelaxed_min}{\F{frelaxed\_min}} +.. |frelaxedmax| mathdef:: \xref{exec/numerics}{op-frelaxed_max}{\F{frelaxed\_max}} +.. |irelaxedq15mulrs| mathdef:: \xref{exec/numerics}{op-irelaxed_q15mulr_s}{\F{irelaxed\_q15mulr\_s}} +.. |irelaxeddotmul| mathdef:: \xref{exec/numerics}{op-irelaxed_dot_mul}{\F{irelaxed\_dot\_mul}} + .. Numerics, meta functions @@ -1523,6 +1559,8 @@ .. |lanes| mathdef:: \xref{exec/numerics}{aux-lanes}{\F{lanes}} +.. 
|relaxed| mathdef:: \xref{exec/numerics}{aux-relaxed}{\F{relaxed}} + .. Other meta functions diff --git a/document/core/valid/instructions.rst b/document/core/valid/instructions.rst index 43b55df47..9f7b29c04 100644 --- a/document/core/valid/instructions.rst +++ b/document/core/valid/instructions.rst @@ -913,6 +913,20 @@ The following auxiliary function denotes the number of lanes in a vector shape, } +.. _valid-relaxed_swizzle: + +:math:`\K{i8x16.}\RELAXEDSWIZZLE` +................................. + +* The instruction is valid with type :math:`[\V128~\V128] \to [\V128]`. + +.. math:: + \frac{ + }{ + C \vdashinstr \K{i8x16.}\RELAXEDSWIZZLE : [\V128~\V128] \to [\V128] + } + + .. _valid-vec-shuffle: :math:`\K{i8x16.}\SHUFFLE~\laneidx^{16}` @@ -1010,6 +1024,34 @@ The following auxiliary function denotes the number of lanes in a vector shape, } +.. _valid-vternop: + +:math:`\shape\K{.}\vternop` +........................... + +* The instruction is valid with type :math:`[\V128~\V128~\V128] \to [\V128]`. + +.. math:: + \frac{ + }{ + C \vdashinstr \shape\K{.}\vternop : [\V128~\V128~\V128] \to [\V128] + } + + +.. _valid-relaxed_laneselect: + +:math:`\shape\K{.}\RELAXEDLANESELECT` +..................................... + +* The instruction is valid with type :math:`[\V128~\V128~\V128] \to [\V128]`. + +.. math:: + \frac{ + }{ + C \vdashinstr \shape\K{.}\RELAXEDLANESELECT : [\V128~\V128~\V128] \to [\V128] + } + + .. _valid-vrelop: :math:`\shape\K{.}\vrelop` @@ -1108,6 +1150,31 @@ The following auxiliary function denotes the number of lanes in a vector shape, } +.. _valid-relaxed_dot: + +:math:`\ishape_1\K{.}\RELAXEDDOT\K{\_}\ishape_2\K{\_i7x16\_s}` +....................................................... + +* The instruction is valid with type :math:`[\V128~\V128] \to [\V128]`. + +.. math:: + \frac{ + }{ + C \vdashinstr \ishape_1\K{.}\RELAXEDDOT\K{\_}\ishape_2\K{\_i7x16\_s} : [\V128~\V128] \to [\V128] + } + +:math:`\ishape_1\K{.}\RELAXEDDOT\K{\_}\ishape_2\K{\_i7x16\_add\_s}` +.............................................................. + +* The instruction is valid with type :math:`[\V128~\V128~\V128] \to [\V128]`. + +.. math:: + \frac{ + }{ + C \vdashinstr \ishape_1\K{.}\RELAXEDDOT\K{\_}\ishape_2\K{\_i7x16\_add\_s} : [\V128~\V128~\V128] \to [\V128] + } + + .. _valid-vec-extmul: :math:`\ishape_1\K{.}\EXTMUL\K{\_}\half\K{\_}\ishape_2\K{\_}\sx` diff --git a/interpreter/README.md b/interpreter/README.md index fc37dc14f..f95f4b6bb 100644 --- a/interpreter/README.md +++ b/interpreter/README.md @@ -471,6 +471,7 @@ result_pat: ( ref.func ) ( ref.extern ) ( ref. ) + ( either + ) ;; alternative results num_pat: ;; literal result @@ -574,6 +575,7 @@ result_pat: ( ref.func ) ( ref.extern ) ( ref.
) + ( either + ) ;; alternative results num_pat: ;; literal result diff --git a/interpreter/binary/decode.ml b/interpreter/binary/decode.ml index 109a5a2de..4ca94d68b 100644 --- a/interpreter/binary/decode.ml +++ b/interpreter/binary/decode.ml @@ -965,6 +965,26 @@ let rec instr s = | 0xfdl -> i32x4_trunc_sat_f64x2_u_zero | 0xfel -> f64x2_convert_low_i32x4_s | 0xffl -> f64x2_convert_low_i32x4_u + | 0x100l -> i8x16_relaxed_swizzle + | 0x101l -> i32x4_relaxed_trunc_f32x4_s + | 0x102l -> i32x4_relaxed_trunc_f32x4_u + | 0x103l -> i32x4_relaxed_trunc_f64x2_s_zero + | 0x104l -> i32x4_relaxed_trunc_f64x2_u_zero + | 0x105l -> f32x4_relaxed_madd + | 0x106l -> f32x4_relaxed_nmadd + | 0x107l -> f64x2_relaxed_madd + | 0x108l -> f64x2_relaxed_nmadd + | 0x109l -> i8x16_relaxed_laneselect + | 0x10al -> i16x8_relaxed_laneselect + | 0x10bl -> i32x4_relaxed_laneselect + | 0x10cl -> i64x2_relaxed_laneselect + | 0x10dl -> f32x4_relaxed_min + | 0x10el -> f32x4_relaxed_max + | 0x10fl -> f64x2_relaxed_min + | 0x110l -> f64x2_relaxed_max + | 0x111l -> i16x8_relaxed_q15mulr_s + | 0x112l -> i16x8_relaxed_dot_i8x16_i7x16_s + | 0x113l -> i32x4_relaxed_dot_i8x16_i7x16_add_s | n -> illegal s pos (I32.to_int_u n) ) diff --git a/interpreter/binary/encode.ml b/interpreter/binary/encode.ml index 93ed69b2d..aca2a5454 100644 --- a/interpreter/binary/encode.ml +++ b/interpreter/binary/encode.ml @@ -742,6 +742,7 @@ struct | VecBinary (V128 (I8x16 V128Op.MaxS)) -> vecop 0x78l | VecBinary (V128 (I8x16 V128Op.MaxU)) -> vecop 0x79l | VecBinary (V128 (I8x16 V128Op.AvgrU)) -> vecop 0x7bl + | VecBinary (V128 (I8x16 V128Op.RelaxedSwizzle)) -> vecop 0x100l | VecBinary (V128 (I16x8 V128Op.NarrowS)) -> vecop 0x85l | VecBinary (V128 (I16x8 V128Op.NarrowU)) -> vecop 0x86l | VecBinary (V128 (I16x8 V128Op.Add)) -> vecop 0x8el @@ -761,6 +762,8 @@ struct | VecBinary (V128 (I16x8 V128Op.ExtMulLowU)) -> vecop 0x9el | VecBinary (V128 (I16x8 V128Op.ExtMulHighU)) -> vecop 0x9fl | VecBinary (V128 (I16x8 V128Op.Q15MulRSatS)) -> vecop 0x82l + | VecBinary (V128 (I16x8 V128Op.RelaxedQ15MulRS)) -> vecop 0x111l + | VecBinary (V128 (I16x8 V128Op.RelaxedDot)) -> vecop 0x112l | VecBinary (V128 (I32x4 V128Op.Add)) -> vecop 0xael | VecBinary (V128 (I32x4 V128Op.Sub)) -> vecop 0xb1l | VecBinary (V128 (I32x4 V128Op.MinS)) -> vecop 0xb6l @@ -788,6 +791,8 @@ struct | VecBinary (V128 (F32x4 V128Op.Max)) -> vecop 0xe9l | VecBinary (V128 (F32x4 V128Op.Pmin)) -> vecop 0xeal | VecBinary (V128 (F32x4 V128Op.Pmax)) -> vecop 0xebl + | VecBinary (V128 (F32x4 V128Op.RelaxedMin)) -> vecop 0x10dl + | VecBinary (V128 (F32x4 V128Op.RelaxedMax)) -> vecop 0x10el | VecBinary (V128 (F64x2 V128Op.Add)) -> vecop 0xf0l | VecBinary (V128 (F64x2 V128Op.Sub)) -> vecop 0xf1l | VecBinary (V128 (F64x2 V128Op.Mul)) -> vecop 0xf2l @@ -796,9 +801,23 @@ struct | VecBinary (V128 (F64x2 V128Op.Max)) -> vecop 0xf5l | VecBinary (V128 (F64x2 V128Op.Pmin)) -> vecop 0xf6l | VecBinary (V128 (F64x2 V128Op.Pmax)) -> vecop 0xf7l + | VecBinary (V128 (F64x2 V128Op.RelaxedMin)) -> vecop 0x10fl + | VecBinary (V128 (F64x2 V128Op.RelaxedMax)) -> vecop 0x110l | VecBinary (V128 _) -> error e.at "illegal binary vector instruction" + | VecTernary (V128 (F32x4 V128Op.RelaxedMadd)) -> vecop 0x105l + | VecTernary (V128 (F32x4 V128Op.RelaxedNmadd)) -> vecop 0x106l + | VecTernary (V128 (F64x2 V128Op.RelaxedMadd)) -> vecop 0x107l + | VecTernary (V128 (F64x2 V128Op.RelaxedNmadd)) -> vecop 0x108l + | VecTernary (V128 (I8x16 V128Op.RelaxedLaneselect)) -> vecop 0x109l + | VecTernary (V128 (I16x8 V128Op.RelaxedLaneselect)) -> 
vecop 0x10al + | VecTernary (V128 (I32x4 V128Op.RelaxedLaneselect)) -> vecop 0x10bl + | VecTernary (V128 (I64x2 V128Op.RelaxedLaneselect)) -> vecop 0x10cl + | VecTernary (V128 (I32x4 V128Op.RelaxedDotAccum)) -> vecop 0x113l + | VecTernary (V128 _) -> + error e.at "illegal ternary vector instruction" + | VecConvert (V128 (I8x16 _)) -> error e.at "illegal i8x16 conversion instruction" | VecConvert (V128 (I16x8 V128Op.ExtendLowS)) -> vecop 0x87l @@ -819,6 +838,10 @@ struct | VecConvert (V128 (I32x4 V128Op.TruncSatUF32x4)) -> vecop 0xf9l | VecConvert (V128 (I32x4 V128Op.TruncSatSZeroF64x2)) -> vecop 0xfcl | VecConvert (V128 (I32x4 V128Op.TruncSatUZeroF64x2)) -> vecop 0xfdl + | VecConvert (V128 (I32x4 V128Op.RelaxedTruncSF32x4)) -> vecop 0x101l + | VecConvert (V128 (I32x4 V128Op.RelaxedTruncUF32x4)) -> vecop 0x102l + | VecConvert (V128 (I32x4 V128Op.RelaxedTruncSZeroF64x2)) -> vecop 0x103l + | VecConvert (V128 (I32x4 V128Op.RelaxedTruncUZeroF64x2)) -> vecop 0x104l | VecConvert (V128 (I64x2 V128Op.ExtendLowS)) -> vecop 0xc7l | VecConvert (V128 (I64x2 V128Op.ExtendHighS)) -> vecop 0xc8l | VecConvert (V128 (I64x2 V128Op.ExtendLowU)) -> vecop 0xc9l diff --git a/interpreter/exec/eval.ml b/interpreter/exec/eval.ml index fd2573e25..2d632dc9f 100644 --- a/interpreter/exec/eval.ml +++ b/interpreter/exec/eval.ml @@ -987,6 +987,10 @@ let rec step (c : config) : config = (try Vec (Eval_vec.eval_binop binop n1 n2) :: vs', [] with exn -> vs', [Trapping (numeric_error e.at exn) @@ e.at]) + | VecTernary ternop, Vec v3 :: Vec v2 :: Vec v1 :: vs' -> + (try Vec (Eval_vec.eval_ternop ternop v1 v2 v3) :: vs', [] + with exn -> vs', [Trapping (numeric_error e.at exn) @@ e.at]) + | VecCompare relop, Vec n2 :: Vec n1 :: vs' -> (try Vec (Eval_vec.eval_relop relop n1 n2) :: vs', [] with exn -> vs', [Trapping (numeric_error e.at exn) @@ e.at]) diff --git a/interpreter/exec/eval_vec.ml b/interpreter/exec/eval_vec.ml index a3b9986ff..2296ed94f 100644 --- a/interpreter/exec/eval_vec.ml +++ b/interpreter/exec/eval_vec.ml @@ -61,6 +61,7 @@ struct | I8x16 MaxS -> V128.I8x16.max_s | I8x16 MaxU -> V128.I8x16.max_u | I8x16 AvgrU -> V128.I8x16.avgr_u + | I8x16 RelaxedSwizzle -> V128.V8x16.swizzle | I16x8 NarrowS -> V128.I16x8_convert.narrow_s | I16x8 NarrowU -> V128.I16x8_convert.narrow_u | I16x8 Add -> V128.I16x8.add @@ -80,6 +81,8 @@ struct | I16x8 ExtMulLowU -> V128.I16x8_convert.extmul_low_u | I16x8 ExtMulHighU -> V128.I16x8_convert.extmul_high_u | I16x8 Q15MulRSatS -> V128.I16x8.q15mulr_sat_s + | I16x8 RelaxedQ15MulRS -> V128.I16x8.q15mulr_sat_s + | I16x8 RelaxedDot -> V128.I16x8_convert.dot_s | I32x4 Add -> V128.I32x4.add | I32x4 Sub -> V128.I32x4.sub | I32x4 MinS -> V128.I32x4.min_s @@ -107,6 +110,8 @@ struct | F32x4 Max -> V128.F32x4.max | F32x4 Pmin -> V128.F32x4.pmin | F32x4 Pmax -> V128.F32x4.pmax + | F32x4 RelaxedMin -> V128.F32x4.min + | F32x4 RelaxedMax -> V128.F32x4.max | F64x2 Add -> V128.F64x2.add | F64x2 Sub -> V128.F64x2.sub | F64x2 Mul -> V128.F64x2.mul @@ -115,9 +120,25 @@ struct | F64x2 Max -> V128.F64x2.max | F64x2 Pmin -> V128.F64x2.pmin | F64x2 Pmax -> V128.F64x2.pmax + | F64x2 RelaxedMin -> V128.F64x2.min + | F64x2 RelaxedMax -> V128.F64x2.max | _ -> assert false in fun v1 v2 -> to_vec (f (of_vec 1 v1) (of_vec 2 v2)) + let ternop (op : ternop) = + let f = match op with + | F32x4 RelaxedMadd -> V128.F32x4.fma + | F32x4 RelaxedNmadd -> V128.F32x4.fnma + | F64x2 RelaxedMadd -> V128.F64x2.fma + | F64x2 RelaxedNmadd -> V128.F64x2.fnma + | I8x16 RelaxedLaneselect -> V128.V1x128.bitselect + | I16x8 
RelaxedLaneselect -> V128.V1x128.bitselect + | I32x4 RelaxedLaneselect -> V128.V1x128.bitselect + | I64x2 RelaxedLaneselect -> V128.V1x128.bitselect + | I32x4 RelaxedDotAccum -> V128.I32x4_convert.dot_s_accum + | _ -> assert false + in fun v1 v2 v3 -> to_vec (f (of_vec 1 v1) (of_vec 2 v2) (of_vec 3 v3)) + let relop (op : relop) = let f = match op with | I8x16 Eq -> V128.I8x16.eq @@ -187,6 +208,10 @@ struct | I32x4 TruncSatUF32x4 -> V128.I32x4_convert.trunc_sat_f32x4_u | I32x4 TruncSatSZeroF64x2 -> V128.I32x4_convert.trunc_sat_f64x2_s_zero | I32x4 TruncSatUZeroF64x2 -> V128.I32x4_convert.trunc_sat_f64x2_u_zero + | I32x4 RelaxedTruncSF32x4 -> V128.I32x4_convert.trunc_sat_f32x4_s + | I32x4 RelaxedTruncUF32x4 -> V128.I32x4_convert.trunc_sat_f32x4_u + | I32x4 RelaxedTruncSZeroF64x2 -> V128.I32x4_convert.trunc_sat_f64x2_s_zero + | I32x4 RelaxedTruncUZeroF64x2 -> V128.I32x4_convert.trunc_sat_f64x2_u_zero | I32x4 ExtAddPairwiseS -> V128.I32x4_convert.extadd_pairwise_s | I32x4 ExtAddPairwiseU -> V128.I32x4_convert.extadd_pairwise_u | I64x2 ExtendLowS -> V128.I64x2_convert.extend_low_s @@ -301,6 +326,7 @@ let op v128 = function let eval_testop = op V128Op.testop let eval_unop = op V128Op.unop let eval_binop = op V128Op.binop +let eval_ternop = op V128Op.ternop let eval_relop = op V128Op.relop let eval_cvtop = op V128Op.cvtop let eval_shiftop = op V128Op.shiftop diff --git a/interpreter/exec/eval_vec.mli b/interpreter/exec/eval_vec.mli index 45b59033e..f4601bd01 100644 --- a/interpreter/exec/eval_vec.mli +++ b/interpreter/exec/eval_vec.mli @@ -3,6 +3,7 @@ open Value val eval_testop : Ast.vec_testop -> vec -> bool val eval_unop : Ast.vec_unop -> vec -> vec val eval_binop : Ast.vec_binop -> vec -> vec -> vec +val eval_ternop : Ast.vec_ternop -> vec -> vec -> vec -> vec val eval_relop : Ast.vec_relop -> vec -> vec -> vec val eval_cvtop : Ast.vec_cvtop -> vec -> vec val eval_shiftop : Ast.vec_shiftop -> vec -> num -> vec diff --git a/interpreter/exec/fxx.ml b/interpreter/exec/fxx.ml index 2385d194d..f94b387a2 100644 --- a/interpreter/exec/fxx.ml +++ b/interpreter/exec/fxx.ml @@ -43,6 +43,7 @@ sig val sub : t -> t -> t val mul : t -> t -> t val div : t -> t -> t + val fma : t -> t -> t -> t val sqrt : t -> t val min : t -> t -> t val max : t -> t -> t @@ -138,6 +139,13 @@ struct let mul x y = binary x ( *.) y let div x y = binary x (/.) 
y + let fma x y z = + let xf = to_float x in + let yf = to_float y in + let zf = to_float z in + let t = Float.fma xf yf zf in + if t = t then of_float t else determine_binary_nan x y + let sqrt x = unary Stdlib.sqrt x let ceil x = unary Stdlib.ceil x diff --git a/interpreter/exec/v128.ml b/interpreter/exec/v128.ml index c508f6b94..a4c4e85a8 100644 --- a/interpreter/exec/v128.ml +++ b/interpreter/exec/v128.ml @@ -191,6 +191,8 @@ sig val sub : t -> t -> t val mul : t -> t -> t val div : t -> t -> t + val fma : t -> t -> t -> t + val fnma : t -> t -> t -> t val min : t -> t -> t val max : t -> t -> t val pmin : t -> t -> t @@ -233,6 +235,9 @@ struct let sub = binop FXX.sub let mul = binop FXX.mul let div = binop FXX.div + let fma x y z = + of_lanes (Lib.List.map3 FXX.fma (to_lanes x) (to_lanes y) (to_lanes z)) + let fnma x y z = fma (unop FXX.neg x) y z let min = binop FXX.min let max = binop FXX.max let pmin = binop (fun x y -> if FXX.lt y x then y else x) @@ -375,9 +380,20 @@ struct let extadd ext x y = Int32.add (ext x) (ext y) let extadd_pairwise_s x = - I16x8.of_lanes (Lib.List.pairwise (extadd ext_s) (I8x16.to_lanes x)) + I16x8.of_lanes (Lib.List.map_pairwise (extadd ext_s) (I8x16.to_lanes x)) let extadd_pairwise_u x = - I16x8.of_lanes (Lib.List.pairwise (extadd ext_u) (I8x16.to_lanes x)) + I16x8.of_lanes (Lib.List.map_pairwise (extadd ext_u) (I8x16.to_lanes x)) + + let dot_s x y = + let xs = I8x16.to_lanes x in + let ys = I8x16.to_lanes y in + let rec dot xs ys = + match xs, ys with + | x1::x2::xs', y1::y2::ys' -> + Int32.(add (mul x1 y1) (mul x2 y2)) :: dot xs' ys' + | [], [] -> [] + | _, _ -> assert false + in I16x8.of_lanes (dot xs ys) end module I32x4_convert = @@ -412,6 +428,20 @@ struct | _, _ -> assert false in I32x4.of_lanes (dot xs ys) + let dot_s_accum x y z = + let xs = I8x16.to_lanes x in + let ys = I8x16.to_lanes y in + let rec dot xs ys = + match xs, ys with + | x1::x2::x3::x4::xs', y1::y2::y3::y4::ys' -> + Int32.(add + (add (mul x1 y1) (mul x2 y2)) + (add (mul x3 y3) (mul x4 y4))) + :: dot xs' ys' + | [], [] -> [] + | _, _ -> assert false + in I32x4.add (I32x4.of_lanes (dot xs ys)) z + let extmul_low_s x y = I32x4.mul (extend_low_s x) (extend_low_s y) let extmul_high_s x y = I32x4.mul (extend_high_s x) (extend_high_s y) let extmul_low_u x y = I32x4.mul (extend_low_u x) (extend_low_u y) @@ -419,9 +449,9 @@ struct let extadd ext x y = Int32.add (ext x) (ext y) let extadd_pairwise_s x = - I32x4.of_lanes (Lib.List.pairwise (extadd ext_s) (I16x8.to_lanes x)) + I32x4.of_lanes (Lib.List.map_pairwise (extadd ext_s) (I16x8.to_lanes x)) let extadd_pairwise_u x = - I32x4.of_lanes (Lib.List.pairwise (extadd ext_u) (I16x8.to_lanes x)) + I32x4.of_lanes (Lib.List.map_pairwise (extadd ext_u) (I16x8.to_lanes x)) end module I64x2_convert = diff --git a/interpreter/exec/v128.mli b/interpreter/exec/v128.mli index f9535c158..e29477945 100644 --- a/interpreter/exec/v128.mli +++ b/interpreter/exec/v128.mli @@ -108,6 +108,8 @@ sig val sub : t -> t -> t val mul : t -> t -> t val div : t -> t -> t + val fma : t -> t -> t -> t + val fnma : t -> t -> t -> t val min : t -> t -> t val max : t -> t -> t val pmin : t -> t -> t @@ -163,6 +165,7 @@ sig val extmul_high_u : t -> t -> t val extadd_pairwise_s : t -> t val extadd_pairwise_u : t -> t + val dot_s : t -> t -> t end module I32x4_convert : @@ -176,6 +179,7 @@ sig val extend_low_u : t -> t val extend_high_u : t -> t val dot_s : t -> t -> t + val dot_s_accum : t -> t -> t -> t val extmul_low_s : t -> t -> t val extmul_high_s : t -> t -> t val 
extmul_low_u : t -> t -> t diff --git a/interpreter/script/js.ml b/interpreter/script/js.ml index c0bd6f2a2..21c4ebbaa 100644 --- a/interpreter/script/js.ml +++ b/interpreter/script/js.ml @@ -239,7 +239,7 @@ let env () : env = let current_mod (env : env) = "$$" ^ string_of_int env.current_mod let of_mod_opt (env : env) = function | None -> current_mod env - | Some x -> x.it + | Some x -> "$" ^ x.it let current_inst (env : env) = "$" ^ string_of_int env.current_inst let of_inst_opt (env : env) = function @@ -343,14 +343,24 @@ let type_of_ref_pat = function | RefTypePat ht -> (NoNull, ht) | NullPat -> (Null, BotHT) -let type_of_result res = +let rec type_of_result res = match res.it with | NumResult pat -> NumT (type_of_num_pat pat) | VecResult pat -> VecT (type_of_vec_pat pat) | RefResult pat -> RefT (type_of_ref_pat pat) + | EitherResult rs -> + let ts = List.map type_of_result rs in + List.fold_left (fun t1 t2 -> + if Match.match_val_type [] t1 t2 then t2 else + if Match.match_val_type [] t2 t1 then t1 else + if Match.(top_of_val_type [] t1 = top_of_val_type [] t2) then + Match.top_of_val_type [] t1 + else + BotT (* should really be Top, but we don't have that :) *) + ) (List.hd ts) ts let assert_return ress ts at = - let test (res, t) = + let rec test (res, t) = if not (Match.match_val_type [] t (type_of_result res)) then [ Br (0l @@ at) @@ at ] else @@ -442,6 +452,17 @@ let assert_return ress ts at = [ RefIsNull @@ at; Test (I32 I32Op.Eqz) @@ at; BrIf (0l @@ at) @@ at ] + | EitherResult ress -> + [ Block (ValBlockType None, + List.map (fun resI -> + Block (ValBlockType None, + test (resI, t) @ + [Br (1l @@ resI.at) @@ resI.at] + ) @@ resI.at + ) ress @ + [Br (1l @@ at) @@ at] + ) @@ at + ] in [], List.flatten (List.rev_map test (List.combine ress ts)) let i32 = NumT I32T @@ -588,11 +609,13 @@ let of_ref_pat = function | RefTypePat t -> "\"ref." 
^ string_of_heap_type t ^ "\"" | NullPat -> "\"ref.null\"" -let of_result res = +let rec of_result res = match res.it with | NumResult np -> of_num_pat np | VecResult vp -> of_vec_pat vp | RefResult rp -> of_ref_pat rp + | EitherResult ress -> + "[" ^ String.concat ", " (List.map of_result ress) ^ "]" let rec of_definition def = match def.it with @@ -602,9 +625,6 @@ let rec of_definition def = try of_definition (snd (Parse.Module.parse_string ~offset:s.at s.it)) with Parse.Syntax _ | Custom.Syntax _ -> of_bytes "" -let of_instance env x_opt = - "instance(" ^ of_mod_opt env x_opt ^ ")" - let of_wrapper env x_opt name wrap_action wrap_assertion at = let x = of_inst_opt env x_opt in let bs = wrap name wrap_action wrap_assertion at in @@ -657,9 +677,9 @@ let of_assertion env ass = | AssertInvalidCustom (def, _) -> "assert_invalid_custom(" ^ of_definition def ^ ");" | AssertUnlinkable (x_opt, _) -> - "assert_unlinkable(" ^ of_instance env x_opt ^ ");" + "assert_unlinkable(" ^ of_mod_opt env x_opt ^ ");" | AssertUninstantiable (x_opt, _) -> - "assert_uninstantiable(" ^ of_instance env x_opt ^ ");" + "assert_uninstantiable(" ^ of_mod_opt env x_opt ^ ");" | AssertReturn (act, ress) -> of_assertion' env act "assert_return" (List.map of_result ress) (Some (assert_return ress)) @@ -682,7 +702,7 @@ let of_command env cmd = | Quoted (_, s) -> unquote (snd (Parse.Module.parse_string ~offset:s.at s.it)) in bind_mod env x_opt (unquote def); - "let " ^ current_mod env ^ " = " ^ of_definition def ^ ";\n" ^ + "let " ^ current_mod env ^ " = module(" ^ of_definition def ^ ");\n" ^ (if x_opt = None then "" else "let " ^ of_mod_opt env x_opt ^ " = " ^ current_mod env ^ ";\n") | Instance (x1_opt, x2_opt) -> diff --git a/interpreter/script/run.ml b/interpreter/script/run.ml index 81496b632..3a2a7c91a 100644 --- a/interpreter/script/run.ml +++ b/interpreter/script/run.ml @@ -234,16 +234,6 @@ let string_of_nan = function | CanonicalNan -> "nan:canonical" | ArithmeticNan -> "nan:arithmetic" -let type_of_result r = - let open Types in - match r with - | NumResult (NumPat n) -> NumT (Value.type_of_num n.it) - | NumResult (NanPat n) -> NumT (Value.type_of_num n.it) - | VecResult (VecPat v) -> VecT (Value.type_of_vec v) - | RefResult (RefPat r) -> RefT (Value.type_of_ref r.it) - | RefResult (RefTypePat t) -> RefT (NoNull, t) (* assume closed *) - | RefResult (NullPat) -> RefT (Null, ExternHT) - let string_of_num_pat (p : num_pat) = match p with | NumPat n -> Value.string_of_num n.it @@ -263,16 +253,38 @@ let string_of_ref_pat (p : ref_pat) = | RefTypePat t -> Types.string_of_heap_type t | NullPat -> "null" -let string_of_result r = - match r with +let rec string_of_result r = + match r.it with | NumResult np -> string_of_num_pat np | VecResult vp -> string_of_vec_pat vp | RefResult rp -> string_of_ref_pat rp + | EitherResult rs -> + "(" ^ String.concat " | " (List.map string_of_result rs) ^ ")" let string_of_results = function | [r] -> string_of_result r | rs -> "[" ^ String.concat " " (List.map string_of_result rs) ^ "]" +let rec type_of_result r = + let open Types in + match r.it with + | NumResult (NumPat n) -> NumT (Value.type_of_num n.it) + | NumResult (NanPat n) -> NumT (Value.type_of_num n.it) + | VecResult (VecPat v) -> VecT (Value.type_of_vec v) + | RefResult (RefPat r) -> RefT (Value.type_of_ref r.it) + | RefResult (RefTypePat t) -> RefT (NoNull, t) (* assume closed *) + | RefResult (NullPat) -> RefT (Null, ExternHT) + | EitherResult rs -> + let ts = List.map type_of_result rs in + List.fold_left (fun t1 t2 -> 
+ if Match.match_val_type [] t1 t2 then t2 else + if Match.match_val_type [] t2 t1 then t1 else + if Match.(top_of_val_type [] t1 = top_of_val_type [] t2) then + Match.top_of_val_type [] t1 + else + BotT (* should really be Top, but we don't have that :) *) + ) (List.hd ts) ts + let print_results rs = let ts = List.map type_of_result rs in Printf.printf "%s : %s\n%!" @@ -406,18 +418,19 @@ let assert_ref_pat r p = | NullPat, Value.NullRef _ -> true | _ -> false -let assert_pat v r = +let rec assert_result v r = let open Value in - match v, r with + match v, r.it with | Num n, NumResult np -> assert_num_pat n np | Vec v, VecResult vp -> assert_vec_pat v vp | Ref r, RefResult rp -> assert_ref_pat r rp + | _, EitherResult rs -> List.exists (assert_result v) rs | _, _ -> false -let assert_result at got expect = +let assert_results at got expect = if List.length got <> List.length expect || - List.exists2 (fun v r -> not (assert_pat v r)) got expect + not (List.for_all2 assert_result got expect) then begin print_string "Result: "; print_values got; print_string "Expect: "; print_results expect; @@ -503,8 +516,7 @@ let run_assertion ass = | AssertReturn (act, rs) -> trace ("Asserting return..."); let got_vs = run_action act in - let expect_rs = List.map (fun r -> r.it) rs in - assert_result ass.at got_vs expect_rs + assert_results ass.at got_vs rs | AssertException act -> trace ("Asserting exception..."); diff --git a/interpreter/script/script.ml b/interpreter/script/script.ml index 315905e31..cf7366ea5 100644 --- a/interpreter/script/script.ml +++ b/interpreter/script/script.ml @@ -37,6 +37,7 @@ and result' = | NumResult of num_pat | VecResult of vec_pat | RefResult of ref_pat + | EitherResult of result list type assertion = assertion' Source.phrase and assertion' = diff --git a/interpreter/syntax/ast.ml b/interpreter/syntax/ast.ml index 6cdbb7952..82942b356 100644 --- a/interpreter/syntax/ast.ml +++ b/interpreter/syntax/ast.ml @@ -64,13 +64,19 @@ struct | AddSatS | AddSatU | SubSatS | SubSatU | DotS | Q15MulRSatS | ExtMulLowS | ExtMulHighS | ExtMulLowU | ExtMulHighU | Swizzle | Shuffle of int list | NarrowS | NarrowU + | RelaxedSwizzle | RelaxedQ15MulRS | RelaxedDot type fbinop = Add | Sub | Mul | Div | Min | Max | Pmin | Pmax + | RelaxedMin | RelaxedMax + type iternop = RelaxedLaneselect | RelaxedDotAccum + type fternop = RelaxedMadd | RelaxedNmadd type irelop = Eq | Ne | LtS | LtU | LeS | LeU | GtS | GtU | GeS | GeU type frelop = Eq | Ne | Lt | Le | Gt | Ge type icvtop = ExtendLowS | ExtendLowU | ExtendHighS | ExtendHighU | ExtAddPairwiseS | ExtAddPairwiseU | TruncSatSF32x4 | TruncSatUF32x4 | TruncSatSZeroF64x2 | TruncSatUZeroF64x2 + | RelaxedTruncSF32x4 | RelaxedTruncUF32x4 + | RelaxedTruncSZeroF64x2 | RelaxedTruncUZeroF64x2 type fcvtop = DemoteZeroF64x2 | PromoteLowF32x4 | ConvertSI32x4 | ConvertUI32x4 type ishiftop = Shl | ShrS | ShrU @@ -84,6 +90,7 @@ struct type testop = (itestop, itestop, itestop, itestop, void, void) V128.laneop type unop = (iunop, iunop, iunop, iunop, funop, funop) V128.laneop type binop = (ibinop, ibinop, ibinop, ibinop, fbinop, fbinop) V128.laneop + type ternop = (iternop, iternop, iternop, iternop, fternop, fternop) V128.laneop type relop = (irelop, irelop, irelop, irelop, frelop, frelop) V128.laneop type cvtop = (icvtop, icvtop, icvtop, icvtop, fcvtop, fcvtop) V128.laneop type shiftop = (ishiftop, ishiftop, ishiftop, ishiftop, void, void) V128.laneop @@ -108,6 +115,7 @@ type vec_testop = (V128Op.testop) Value.vecop type vec_relop = (V128Op.relop) Value.vecop type 
vec_unop = (V128Op.unop) Value.vecop type vec_binop = (V128Op.binop) Value.vecop +type vec_ternop = (V128Op.ternop) Value.vecop type vec_cvtop = (V128Op.cvtop) Value.vecop type vec_shiftop = (V128Op.shiftop) Value.vecop type vec_bitmaskop = (V128Op.bitmaskop) Value.vecop @@ -226,6 +234,7 @@ and instr' = | VecCompare of vec_relop (* vector comparison *) | VecUnary of vec_unop (* unary vector operator *) | VecBinary of vec_binop (* binary vector operator *) + | VecTernary of vec_ternop (* ternary vector operator *) | VecConvert of vec_cvtop (* vector conversion *) | VecShift of vec_shiftop (* vector shifts *) | VecBitmask of vec_bitmaskop (* vector masking *) diff --git a/interpreter/syntax/free.ml b/interpreter/syntax/free.ml index 07d9d547e..72314487f 100644 --- a/interpreter/syntax/free.ml +++ b/interpreter/syntax/free.ml @@ -187,7 +187,8 @@ let rec instr (e : instr) = | MemoryCopy (x, y) -> memories (idx x) ++ memories (idx y) | MemoryInit (x, y) -> memories (idx x) ++ datas (idx y) | DataDrop x -> datas (idx x) - | VecConst _ | VecTest _ | VecUnary _ | VecBinary _ | VecCompare _ + | VecConst _ | VecTest _ + | VecUnary _ | VecBinary _ | VecTernary _ | VecCompare _ | VecConvert _ | VecShift _ | VecBitmask _ | VecTestBits _ | VecUnaryBits _ | VecBinaryBits _ | VecTernaryBits _ | VecSplat _ | VecExtract _ | VecReplace _ -> diff --git a/interpreter/syntax/operators.ml b/interpreter/syntax/operators.ml index 9a467b1cb..2f9879ef8 100644 --- a/interpreter/syntax/operators.ml +++ b/interpreter/syntax/operators.ml @@ -560,3 +560,24 @@ let f64x2_pmax = VecBinary (V128 (F64x2 V128Op.Pmax)) let f64x2_promote_low_f32x4 = VecConvert (V128 (F64x2 V128Op.PromoteLowF32x4)) let f64x2_convert_low_i32x4_s = VecConvert (V128 (F64x2 V128Op.ConvertSI32x4)) let f64x2_convert_low_i32x4_u = VecConvert (V128 (F64x2 V128Op.ConvertUI32x4)) + +let i8x16_relaxed_swizzle = VecBinary (V128 (I8x16 V128Op.RelaxedSwizzle)) +let i8x16_relaxed_laneselect = VecTernary (V128 (I8x16 V128Op.RelaxedLaneselect)) +let i16x8_relaxed_q15mulr_s = VecBinary (V128 (I16x8 V128Op.RelaxedQ15MulRS)) +let i16x8_relaxed_laneselect = VecTernary (V128 (I16x8 V128Op.RelaxedLaneselect)) +let i32x4_relaxed_trunc_f32x4_s = VecConvert (V128 (I32x4 V128Op.RelaxedTruncSF32x4)) +let i32x4_relaxed_trunc_f32x4_u = VecConvert (V128 (I32x4 V128Op.RelaxedTruncUF32x4)) +let i32x4_relaxed_trunc_f64x2_s_zero = VecConvert (V128 (I32x4 V128Op.RelaxedTruncSZeroF64x2)) +let i32x4_relaxed_trunc_f64x2_u_zero = VecConvert (V128 (I32x4 V128Op.RelaxedTruncUZeroF64x2)) +let i32x4_relaxed_laneselect = VecTernary (V128 (I32x4 V128Op.RelaxedLaneselect)) +let i64x2_relaxed_laneselect = VecTernary (V128 (I64x2 V128Op.RelaxedLaneselect)) +let f32x4_relaxed_madd = VecTernary (V128 (F32x4 V128Op.RelaxedMadd)) +let f32x4_relaxed_nmadd = VecTernary (V128 (F32x4 V128Op.RelaxedNmadd)) +let f32x4_relaxed_min = VecBinary (V128 (F32x4 V128Op.RelaxedMin)) +let f32x4_relaxed_max = VecBinary (V128 (F32x4 V128Op.RelaxedMax)) +let f64x2_relaxed_madd = VecTernary (V128 (F64x2 V128Op.RelaxedMadd)) +let f64x2_relaxed_nmadd = VecTernary (V128 (F64x2 V128Op.RelaxedNmadd)) +let f64x2_relaxed_min = VecBinary (V128 (F64x2 V128Op.RelaxedMin)) +let f64x2_relaxed_max = VecBinary (V128 (F64x2 V128Op.RelaxedMax)) +let i16x8_relaxed_dot_i8x16_i7x16_s = VecBinary (V128 (I16x8 V128Op.RelaxedDot)) +let i32x4_relaxed_dot_i8x16_i7x16_add_s = VecTernary (V128 (I32x4 V128Op.RelaxedDotAccum)) diff --git a/interpreter/text/arrange.ml b/interpreter/text/arrange.ml index 3d5ac0ba6..fe9fbf2a4 100644 --- 
a/interpreter/text/arrange.ml +++ b/interpreter/text/arrange.ml @@ -259,6 +259,10 @@ struct | "32x4" -> "64x2" | _ -> assert false + let without_high_bit = function + | "8x16" -> "7x16" + | _ -> assert false + let voidop xxxx = function (_ : void) -> . let itestop xxxx (op : itestop) = match op with @@ -301,6 +305,13 @@ struct | NarrowU -> "narrow_i" ^ double xxxx ^ "_u" | Shuffle is -> "shuffle " ^ String.concat " " (List.map nat is) | Swizzle -> "swizzle" + | RelaxedSwizzle -> "relaxed_swizzle" + | RelaxedQ15MulRS -> "relaxed_q15mulr_s" + | RelaxedDot -> "relaxed_dot_i" ^ half xxxx ^ "_i" ^ without_high_bit (half xxxx) ^ "_s" + + let iternop xxxx (op : iternop) = match op with + | RelaxedLaneselect -> "relaxed_laneselect" + | RelaxedDotAccum -> "relaxed_dot_i" ^ half (half xxxx) ^ "_i" ^ without_high_bit (half (half xxxx)) ^ "_add_s" let fbinop xxxx (op : fbinop) = match op with | Add -> "add" @@ -311,6 +322,12 @@ struct | Max -> "max" | Pmin -> "pmin" | Pmax -> "pmax" + | RelaxedMin -> "relaxed_min" + | RelaxedMax -> "relaxed_max" + + let fternop xxxx (op : fternop) = match op with + | RelaxedMadd -> "relaxed_madd" + | RelaxedNmadd-> "relaxed_nmadd" let irelop xxxx (op : irelop) = match op with | Eq -> "eq" @@ -343,6 +360,10 @@ struct | TruncSatUF32x4 -> "trunc_sat_f32x4_u" | TruncSatSZeroF64x2 -> "trunc_sat_f64x2_s_zero" | TruncSatUZeroF64x2 -> "trunc_sat_f64x2_u_zero" + | RelaxedTruncSF32x4 -> "relaxed_trunc_f32x4_s" + | RelaxedTruncUF32x4 -> "relaxed_trunc_f32x4_u" + | RelaxedTruncSZeroF64x2 -> "relaxed_trunc_f64x2_s_zero" + | RelaxedTruncUZeroF64x2 -> "relaxed_trunc_f64x2_u_zero" let fcvtop xxxx (op : fcvtop) = match op with | DemoteZeroF64x2 -> "demote_f64x2_zero" @@ -422,6 +443,7 @@ let cvtop = oper (IntOp.cvtop, FloatOp.cvtop) let vec_unop = vec_shape_oper (V128Op.iunop, V128Op.iunop, V128Op.funop) let vec_binop = vec_shape_oper (V128Op.ibinop, V128Op.ibinop, V128Op.fbinop) +let vec_ternop = vec_shape_oper (V128Op.iternop, V128Op.iternop, V128Op.fternop) let vec_testop = vec_shape_oper (V128Op.itestop, V128Op.itestop, V128Op.voidop) let vec_relop = vec_shape_oper (V128Op.irelop, V128Op.irelop, V128Op.frelop) let vec_cvtop = vec_shape_oper (V128Op.icvtop, V128Op.icvtop, V128Op.fcvtop) @@ -583,6 +605,7 @@ let rec instr e = | VecTest op -> vec_testop op, [] | VecUnary op -> vec_unop op, [] | VecBinary op -> vec_binop op, [] + | VecTernary op -> vec_ternop op, [] | VecCompare op -> vec_relop op, [] | VecConvert op -> vec_cvtop op, [] | VecShift op -> vec_shiftop op, [] @@ -869,11 +892,12 @@ let ref_pat = function | RefTypePat t -> Node ("ref." 
^ heap_type t, []) | NullPat -> Node ("ref.null", []) -let result mode res = +let rec result mode res = match res.it with | NumResult np -> num_pat mode np | VecResult vp -> vec_pat mode vp | RefResult rp -> ref_pat rp + | EitherResult ress -> Node ("either", List.map (result mode) ress) let instance (x1_opt, x2_opt) = Node ("module instance" ^ var_opt x1_opt ^ var_opt x2_opt, []) diff --git a/interpreter/text/lexer.mll b/interpreter/text/lexer.mll index 5b95a1249..02430dea7 100644 --- a/interpreter/text/lexer.mll +++ b/interpreter/text/lexer.mll @@ -742,6 +742,27 @@ rule token = parse | "f32x4.replace_lane" -> VEC_REPLACE f32x4_replace_lane | "f64x2.replace_lane" -> VEC_REPLACE f64x2_replace_lane + | "i8x16.relaxed_swizzle" -> VEC_BINARY i8x16_relaxed_swizzle + | "i32x4.relaxed_trunc_f32x4_u" -> VEC_UNARY i32x4_relaxed_trunc_f32x4_u + | "i32x4.relaxed_trunc_f32x4_s" -> VEC_UNARY i32x4_relaxed_trunc_f32x4_s + | "i32x4.relaxed_trunc_f64x2_u_zero" -> VEC_UNARY i32x4_relaxed_trunc_f64x2_u_zero + | "i32x4.relaxed_trunc_f64x2_s_zero" -> VEC_UNARY i32x4_relaxed_trunc_f64x2_s_zero + | "f32x4.relaxed_madd" -> VEC_UNARY f32x4_relaxed_madd + | "f64x2.relaxed_madd" -> VEC_UNARY f64x2_relaxed_madd + | "f32x4.relaxed_nmadd" -> VEC_UNARY f32x4_relaxed_nmadd + | "f64x2.relaxed_nmadd" -> VEC_UNARY f64x2_relaxed_nmadd + | "i8x16.relaxed_laneselect" -> VEC_TERNARY i8x16_relaxed_laneselect + | "i16x8.relaxed_laneselect" -> VEC_TERNARY i16x8_relaxed_laneselect + | "i32x4.relaxed_laneselect" -> VEC_TERNARY i32x4_relaxed_laneselect + | "i64x2.relaxed_laneselect" -> VEC_TERNARY i64x2_relaxed_laneselect + | "f32x4.relaxed_min" -> VEC_UNARY f32x4_relaxed_min + | "f64x2.relaxed_min" -> VEC_UNARY f64x2_relaxed_min + | "f32x4.relaxed_max" -> VEC_UNARY f32x4_relaxed_max + | "f64x2.relaxed_max" -> VEC_UNARY f64x2_relaxed_max + | "i16x8.relaxed_q15mulr_s" -> VEC_BINARY i16x8_relaxed_q15mulr_s + | "i16x8.relaxed_dot_i8x16_i7x16_s" -> VEC_BINARY i16x8_relaxed_dot_i8x16_i7x16_s + | "i32x4.relaxed_dot_i8x16_i7x16_add_s" -> VEC_BINARY i32x4_relaxed_dot_i8x16_i7x16_add_s + | "type" -> TYPE | "func" -> FUNC | "param" -> PARAM @@ -781,6 +802,7 @@ rule token = parse | "assert_exhaustion" -> ASSERT_EXHAUSTION | "nan:canonical" -> NAN Script.CanonicalNan | "nan:arithmetic" -> NAN Script.ArithmeticNan + | "either" -> EITHER | "input" -> INPUT | "output" -> OUTPUT diff --git a/interpreter/text/parser.mly b/interpreter/text/parser.mly index 536685b58..bd5d205ac 100644 --- a/interpreter/text/parser.mly +++ b/interpreter/text/parser.mly @@ -374,6 +374,7 @@ let parse_annots (m : module_) : Custom.section list = %token ASSERT_RETURN ASSERT_TRAP ASSERT_EXCEPTION ASSERT_EXHAUSTION %token ASSERT_MALFORMED_CUSTOM ASSERT_INVALID_CUSTOM %token NAN +%token EITHER %token INPUT OUTPUT %token EOF @@ -1567,6 +1568,7 @@ result : error (at $sloc) "wrong number of lane literals"; VecResult (VecPat (Value.V128 ($3, List.map (fun lit -> lit $3) $4))) @@ $sloc } + | LPAR EITHER result list(result) RPAR { EitherResult ($3 :: $4) @@ $sloc } script : | list(cmd) EOF { List.concat $1 } diff --git a/interpreter/util/lib.ml b/interpreter/util/lib.ml index 8f9b6fe7e..0d9308994 100644 --- a/interpreter/util/lib.ml +++ b/interpreter/util/lib.ml @@ -104,9 +104,15 @@ struct let index_of x = index_where ((=) x) - let rec pairwise f = function + let rec map3 f xs ys zs = + match xs, ys, zs with + | [], [], [] -> [] + | x::xs', y::ys', z::zs' -> f x y z :: map3 f xs' ys' zs' + | _ -> raise (Invalid_argument "Lib.List.map3") + + let rec map_pairwise f = function | [] 
-> [] - | x1::x2::xs -> f x1 x2 :: pairwise f xs + | x1::x2::xs -> f x1 x2 :: map_pairwise f xs | _ -> failwith "pairwise" end diff --git a/interpreter/util/lib.mli b/interpreter/util/lib.mli index a4f4d1469..48c124349 100644 --- a/interpreter/util/lib.mli +++ b/interpreter/util/lib.mli @@ -25,7 +25,9 @@ sig val index_of : 'a -> 'a list -> int option val index_where : ('a -> bool) -> 'a list -> int option - val pairwise : ('a -> 'a -> 'b) -> 'a list -> 'b list + + val map3 : ('a -> 'b -> 'c -> 'd) -> 'a list -> 'b list -> 'c list -> 'd list + val map_pairwise : ('a -> 'a -> 'b) -> 'a list -> 'b list end module List32 : diff --git a/interpreter/valid/match.ml b/interpreter/valid/match.ml index 5b46b82f7..77c9c4f57 100644 --- a/interpreter/valid/match.ml +++ b/interpreter/valid/match.ml @@ -26,6 +26,12 @@ and top_of_heap_type c = function | VarHT (StatX x) -> top_of_str_type c (expand_def_type (lookup c x)) | VarHT (RecX _) | BotHT -> assert false +let top_of_val_type c = function + | NumT _ as t -> t + | VecT _ as t -> t + | RefT (_, ht) -> RefT (Null, top_of_heap_type c ht) + | BotT -> BotT (* well.. *) + let rec bot_of_str_type c st = bot_of_heap_type c (abs_of_str_type c st) diff --git a/interpreter/valid/match.mli b/interpreter/valid/match.mli index 5f0c96398..c9b3a06a0 100644 --- a/interpreter/valid/match.mli +++ b/interpreter/valid/match.mli @@ -8,6 +8,7 @@ type context = def_type list (* Extremas *) +val top_of_val_type : context -> val_type -> val_type val top_of_heap_type : context -> heap_type -> heap_type val bot_of_heap_type : context -> heap_type -> heap_type val top_of_str_type : context -> str_type -> heap_type diff --git a/interpreter/valid/valid.ml b/interpreter/valid/valid.ml index 17e73080d..511c34b09 100644 --- a/interpreter/valid/valid.ml +++ b/interpreter/valid/valid.ml @@ -876,6 +876,10 @@ let rec check_instr (c : context) (e : instr) (s : infer_result_type) : infer_in let t = VecT (type_vec binop) in [t; t] --> [t], [] + | VecTernary ternop -> + let t = VecT (type_vec ternop) in + [t; t; t] --> [t], [] + | VecCompare relop -> let t = VecT (type_vec relop) in [t; t] --> [t], [] diff --git a/proposals/relaxed-simd/Entropy.md b/proposals/relaxed-simd/Entropy.md new file mode 100644 index 000000000..52898e233 --- /dev/null +++ b/proposals/relaxed-simd/Entropy.md @@ -0,0 +1,57 @@ +# Entropy, Compat and Usage patterns + +The Relaxed SIMD proposal adds new operations that take advantage of the underlying hardware for accelerated performance by loosening the portability constraints of fixed-width SIMD, or adding new instructions that introduce local non-determinism. This introduces a few potential risks specifically for browser engine implementations, though it’s possible that one or more of these might be applicable to other environments: +* Identifying underlying characteristics of the device (processor information if the engine that implements this proposal uses the most optimal lowerings) +* Possibly identifying information about the browser currently in use (i.e. if a sufficiently motivated user writes a hand-tuned Wasm function when engines use different lowerings for instructions) +* Compatibility risks, i.e.
if code compiled for one browser works differently on a different browser (this is specific to observable behavior changes, not performance differences) + +This document collates the information already available in the issues filed for each of the operations on fingerprinting risk, quantifies the risk of fingerprinting/compat issues, and provides some information about usage patterns. + +## Instruction summary + +* **relaxed i8x16.swizzle**
+ *Entropy exposed:* Differences between x86/ARM
+ *Deterministic lowering:* Available
+ *Compat risk:* Low, as the differences exposed are for out of range indices
+ [*Issue link*](https://github.com/WebAssembly/relaxed-simd/issues/22) +* **relaxed i32x4.trunc_{f32x4, f64x2} operations**
+ *Entropy exposed:* Differences between x86/ARM
+ *Deterministic lowering:* Available
+ *Compat risk:* Low, as the differing behavior is for out of range values, and NaNs
+ [*Issue link*](https://github.com/WebAssembly/relaxed-simd/issues/21)
+* **qfma, qfmns**
+ *Entropy exposed:* Differences between hardware that has native FMA support, and hardware that does not.
+ *Deterministic lowering:* Not available; depending on the underlying hardware, the results can only be either fused or unfused.
+ *Compat risk:* Potentially divergent behavior based on hardware FMA support
+ *Mitigating reasons to include:* Most modern hardware does have native FMA support, the performance wins are significant, and the operations are difficult to emulate
+ [*Issue link*](https://github.com/WebAssembly/simd/pull/79)
+* **{i8x16, i16x8, i32x4, i64x2}.laneselect**
+ *Entropy exposed:* x86/ARM
+ *Deterministic lowering:* Available
+ *Compat risk:* Medium, architectures vary on which bit is used for lane selection
+ [*Issue link*](https://github.com/WebAssembly/relaxed-simd/issues/17)
+* **{f32x4, f64x2}.{min,max}**
+ *Entropy exposed:* x86/ARM
+ *Deterministic lowering:* Available
+ *Compat risk:* Low, varying outputs when one of the inputs is NaN, or +0, -0
+ [*Issue link*](https://github.com/WebAssembly/relaxed-simd/issues/33)
+* **i16x8.relaxed_q15mulr_s**
+ *Entropy exposed:* x86/ARM
+ *Deterministic lowering:* Available
+ *Compat risk:* Low, different behaviors only in the overflow case
+ [*Issue link*](https://github.com/WebAssembly/relaxed-simd/issues/40)
+* **Dot product instructions**
+ *Entropy exposed:* x86/ARM, and whether the Dot product extension is supported in the most optimal codegen
+ *Deterministic lowering:* Available, but only when not using the most optimal codegen
+ *Compat risk:* Medium for architectures that support the Dot product extensions as they vary in saturating/wrapping behavior of intermediate results
*Mitigating reasons to include:* [Performance](https://docs.google.com/presentation/d/1xlyO1ly2Fbo2Up5ZuV_BTSwiNpCwPygag09XQRjclSA/edit#slide=id.g1fee95a4c4f_0_0)
+ [*Issue link*](https://github.com/WebAssembly/relaxed-simd/issues/52) + +## Usage patterns + +One of the things to note here, for compat especially, is to lay out the usage patterns, or specifically when these instructions are generated. As the proposal is still in the experimental state, the only way to currently experiment with these instructions are to call the clang built-ins directly, and experiment with them on an engine that has these instructions available as a prototype. + +In the future, these can be invoked using: + * Wasm Intrinsic header files + * Possibly in the autovectorizer, but only when flags like `-ffast-math` are used, that assume inputs and/or results are not NaNs or +-Infs. + +The thing to note here is that by either explicitly calling the intrinsics, or using specific compiler flags is a high enough threshold that this type of local non-determinism is not something a user would encounter by default, i.e. these instructions can only be used deliberately, and the proposal assumes a specialized usage. diff --git a/proposals/relaxed-simd/Overview.md b/proposals/relaxed-simd/Overview.md new file mode 100644 index 000000000..8fe79bfec --- /dev/null +++ b/proposals/relaxed-simd/Overview.md @@ -0,0 +1,357 @@ +# Relaxed SIMD proposal + +## Summary + +This proposal adds a set of useful SIMD instructions that introduce local +non-determinism (where the results of the instructions may vary based on +hardware support). + +## Motivation + +Applications running on Wasm SIMD cannot take full advantage of hardware +capabilities. There are 3 reasons: + +1. Instruction depends on hardware support +2. Approximate instructions that are underspecified in hardware +3. Some SIMD instructions penalize particular architecture + +See [these +slides](https://docs.google.com/presentation/d/1Qnx0nbNTRYhMONLuKyygEduCXNOv3xtWODfXfYokx1Y/edit?usp=sharing) +for more details. + +## Overview + +Broadly, there are three categories of instructions that fit into the Relaxed SIMD proposal: + +1. Integer instructions where the inputs are interpreted differently (e.g. + swizzle, 4-D dot-product) +2. Floating-point instructions whose behavior for out-of-range and NaNs differ + (e.g. float-to-int conversions, float min/max) +3. Floating-point instructions where the precision or order of operations + differ (e.g. FMA, reciprocal instructions, sum reduction) + +Example of some instructions we would like to add: + +- Fused Multiply Add (single rounding if hardware supports it, double rounding if not) +- Approximate reciprocal/reciprocal sqrt +- Relaxed Swizzle (implementation defined out of bounds behavior) +- Relaxed Rounding Q-format Multiplication (optional saturation) + +### Consistency + +This proposal introduces non-deterministic instructions - given the same +inputs, two calls to the same instruction can return different results. For +example: + +```wast +(module + (func (param v128 v128 v128) + (f32x4.qfma (local.get 0) (local.get 1) (local.get 2)) ;; (1) + ;; some other computation + (f32x4.qfma (local.get 0) (local.get 1) (local.get 2)))) ;; (2) +``` + +The same instruction at `(1)` and `(2)`, when given the same inputs, can return +two different results. This is compliant as the instruction is +non-deterministic, though unlikely on certain embeddings like the Web (where +the same implementation for `f32x4.qfma` is likely to be used for all calls to +that instruction). 
One can imagine splitting an application's module and +running them on multiple runtimes, where the runtimes produce +different results - this can be surprising to the application. + +The specification is updated with the idea of "Relaxed operations": + +> Some operators are host-dependent, because the set of possible results may +> depend on properties of the host environment (such as hardware). Technically, +> each such operator produces a fixed-size list of sets of allowed values. For +> each execution of the operator in the same environment, only values from the +> set at the same position in the list are returned, i.e., each environment +> globally chooses a fixed projection for each operator. + +## Instructions + +`IMPLEMENTATION_DEFINED_ONE_OF(...)` returns any one of the results in the argument list, depending on the implementation. + +### Relaxed swizzle + +- `relaxed i8x16.swizzle(a : v128, s : v128) -> v128` + +`relaxed i8x16.swizzle(a, s)` selects lanes from `a` using indices in `s`, +indices in the range `[0,15]` will select the `i`-th element of `a`, the result +for any out of range indices is implementation-defined (i.e. if the index is +`[16-255]`. + +```python +def relaxed_i8x16_swizzle(a : i8x16, s : i8x16): + result = [] + for i in range(16): + if s[i] < 16: + result[i] = a[s[i]] + else if s[i] < 128: + result[i] = IMPLEMENTATION_DEFINED_ONE_OF(0, a[s[i]%16]) + else: + result[i] = 0 + return result +``` + +### Float/Double to int conversions + +- `relaxed i32x4.trunc_f32x4_s` (relaxed version of `i32x4.trunc_sat_f32x4_s`) +- `relaxed i32x4.trunc_f32x4_u` (relaxed version of `i32x4.trunc_sat_f32x4_u`) +- `relaxed i32x4.trunc_f64x2_s_zero` (relaxed version of `i32x4.trunc_sat_f64x2_s_zero`) +- `relaxed i32x4.trunc_f64x2_u_zero` (relaxed version of `i32x4.trunc_sat_f64x2_u_zero`) + +These instructions have the same behavior as the non-relaxed instructions for +lanes that are in the range of an `i32` (signed or unsigned depending on the +instruction). The result of lanes which contain NaN is implementation defined, +either 0 or `INT32_MAX` for signed and `UINT32_MAX` for unsigned. The result of +lanes which are out of bounds of `INT32` or `UINT32` is implementation defined, +it can be either the saturated result or `INT32_MAX` for signed and `UINT32_MAX` +for unsigned. 
+ +```python +def relaxed_i32x4_trunc_f32x4_s(a : f32x4) -> i32x4: + result = [0, 0, 0, 0] + for i in range(4): + if isnan(a[i]): + result[i] = IMPLEMENTATION_DEFINED_ONE_OF(0, INT32_MIN) + continue + r = truncate(a[i]) + if r < INT32_MIN: + result[i] = INT32_MIN + elif r > INT32_MAX + result[i] = IMPLEMENTATION_DEFINED_ONE_OF(INT32_MIN, INT32_MAX) + else: + result[i] = r + +def relaxed_i32x4_trunc_f32x4_u(a : f32x4) -> i32x4: + result = [0, 0, 0, 0] + for i in range(4): + if isnan(a[i]): + result[i] = IMPLEMENTATION_DEFINED_ONE_OF(0, UINT32_MAX) + continue + r = truncate(a[i]) + if r < UINT32_MIN: + result[i] = IMPLEMENTATION_DEFINED_ONE_OF(UINT32_MIN, UINT32_MAX) + elif r > UINT32_MAX: + result[i] = UINT32_MAX + else: + result[i] = r + +def relaxed_i32x4_trunc_f64x2_zero_s(a : f64x2) -> i32x4: + result = [0, 0, 0, 0] + for i in range(2): + if isnan(a[i]): + result[i] = IMPLEMENTATION_DEFINED_ONE_OF(0, INT32_MIN) + continue + r = truncate(a[i]) + if r < INT32_MIN: + result[i] = INT32_MIN + elif r > INT32_MAX + result[i] = IMPLEMENTATION_DEFINED_ONE_OF(INT32_MIN, INT32_MAX) + else: + result[i] = r + +def relaxed_i32x4_trunc_f64x2_zero_u(a : f64x2) -> i32x4: + result = [0, 0, 0, 0] + for i in range(2): + if isnan(a[i]): + result[i] = IMPLEMENTATION_DEFINED_ONE_OF(0, UINT32_MAX) + continue + r = truncate(a[i]) + if r < UINT32_MIN: + result[i] = IMPLEMENTATION_DEFINED_ONE_OF(UINT32_MIN, UINT32_MAX) + elif r > UINT32_MAX: + result[i] = UINT32_MAX + else: + result[i] = r +``` + +### Relaxed fused multiply-add and fused negative multiply-add + +- `relaxed f32x4.madd` +- `relaxed f32x4.nmadd` +- `relaxed f64x2.madd` +- `relaxed f64x2.nmadd` + +All the instructions take 3 operands, `a`, `b`, `c`, perform `a * b + c` or `-(a * b) + c`: + +- `relaxed f32x4.madd(a, b, c) = a * b + c` +- `relaxed f32x4.nmadd(a, b, c) = -(a * b) + c` +- `relaxed f64x2.madd(a, b, c) = a * b + c` +- `relaxed f64x2.nmadd(a, b, c) = -(a * b) + c` + +where: + +- the intermediate `a * b` is be rounded first, and the final result rounded again (for a total of 2 roundings), or +- the entire expression evaluated with higher precision and then only rounded once (if supported by hardware). + +### Relaxed laneselect + +- `i8x16.laneselect(a: v128, b: v128, m: v128) -> v128` +- `i16x8.laneselect(a: v128, b: v128, m: v128) -> v128` +- `i32x4.laneselect(a: v128, b: v128, m: v128) -> v128` +- `i64x2.laneselect(a: v128, b: v128, m: v128) -> v128` + +Select lanes from `a` or `b` based on masks in `m`. If each lane-sized mask in `m` has all bits set or all bits unset, these instructions behave the same as `v128.bitselect`. Otherwise, the result is implementation defined. + +```python +def laneselect(a : v128, b : v128, m: v128, lanes : int): + result = [0] * lanes + for i in range(lanes): + mask = m[i] + if mask == ~0: + result[i] = a[i] + elif mask == 0: + result[i] = b[i] + else topbit(mask) == 1: + result[i] = IMPLEMENTATION_DEFINED_ONE_OF(bitselect(a[i], b[i], mask), a[i]) + else: + result[i] = IMPLEMENTATION_DEFINED_ONE_OF(bitselect(a[i], b[i], mask), b[i]) + return result +``` + +### Relaxed min and max + +- `f32x4.min(a: v128, b: v128) -> v128` +- `f32x4.max(a: v128, b: v128) -> v128` +- `f64x2.min(a: v128, b: v128) -> v128` +- `f64x2.max(a: v128, b: v128) -> v128` + +Return the lane-wise minimum or maximum of two values. If either values is NaN, or the values are -0.0 and +0.0, the return value is implementation-defined. 
+ +```python +def min_or_max(a : v128, b : v128, lanes : int, is_min : bool): + result = [] + for i in range(lanes): + if isNaN(a[i]) or isNaN(b[i]): + result[i] = IMPLEMENTATION_DEFINED_ONE_OF(a[i], b[i]) + elif (a[i] == -0.0 && b[i] == +0.0) or (a[i] == +0.0 && b[i] == -0.0): + result[i] = IMPLEMENTATION_DEFINED_ONE_OF(a[i], b[i]) + else: + result[i] = is_min ? min(a, b) : max(a, b) + return result +``` + +### Relaxed Rounding Q-format Multiplication + +- `i16x8.q15mulr_s(a: v128, b: v128) -> v128` + +Returns the multiplication of 2 fixed-point numbers in Q15 format. If both +inputs are `INT16_MIN`, the result overflows, and the return value is +implementation defined (either `INT16_MIN` or `INT16_MAX`). + +```python +def q15mulr(a, b): + result = [] + for i in range(lanes): + if (a[i] == INT16_MIN && b[i] == INT16_MIN): + result[i] = IMPLEMENTATION_DEFINED_ONE_OF(INT16_MIN, INT16_MAX) + else: + result[i] = (a[i] * b[i] + 0x4000) >> 15 + return result +``` + +### Relaxed integer dot product + +- `i16x8.dot_i8x16_i7x16_s(a: v128, b: v128) -> v128` +- `i32x4.dot_i8x16_i7x16_add_s(a: v128, b:v128, c:v128) -> v128` + +Returns the multiplication of 8-bit elements (signed or unsigned) by 7-bit +elements (unsigned) with accumulation of adjacent products. The `i32x4` versions +allows for accumulation into another vector. + +When the second operand of the product has the high bit set in a lane, that +lane's result is implementation defined. + +```python +def i16x8_dot_i8x16_i7x16_s(a, b): + intermediate = [] + result = [] + for i in range(16): + if (b[i] & 0x80): + lhs = as_signed(a[i]) + rhs = IMPLEMENTATION_DEFINED_ONE_OF(as_signed(b[i]), as_unsigned(b[i])) + intermediate[i] = lhs * rhs + else: + intermediate[i] = as_signed(a[i]) * b[i] + for i in range(0, 16, 2): + result[i/2] = IMPLEMENTATION_DEFINED_ONE_OF( + intermediate[i] + intermediate[i+1], + saturate(intermediate[i] + intermediate[i+1])) + +def i32x4.dot_i8x16_i7x16_add_s(a, b, c): + intermediate = [] + tmp = [] + result = [] + for i in range(16): + if (b[i] & 0x80): + lhs = as_signed(a[i]) + rhs = IMPLEMENTATION_DEFINED_ONE_OF(as_signed(b[i]), as_unsigned(b[i])) + intermediate[i] = lhs * rhs + else: + intermediate[i] = as_signed(a[i]) * b[i] + + for i in range(0, 16, 2): + tmp[i/2] = IMPLEMENTATION_DEFINED_ONE_OF( + intermediate[i] + intermediate[i+1], + saturate(intermediate[i] + intermediate[i+1])) + + for i in range(0, 8, 2): + dst[i/4] = tmp[i] + tmp[i+1] + + for i in range(0, 4): + dst[i] += c[i] + +saturate(x) = min(INT16_MAX, max(INT16_MIN, x)) +``` + +## Binary format + +All opcodes have the `0xfd` prefix (same as SIMD proposal), which are omitted in the table below. +Opcodes `0x100` to `0x12F` (32 opcodes) are reserved for this proposal. + +Note: "prototype opcode" refers to the opcodes that were used in prototyping, which +where chosen to fit into the holes in the opcode space of SIMD proposal. Going +forward, the opcodes for relaxed-simd specification will be the ones in the +"opcode" column, and it will take some time for tools and engines to update. 
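+As an illustrative aside (an editorial sketch, not part of the normative proposal text): assuming the sub-opcode is serialized as an unsigned LEB128 `u32` after the `0xfd` prefix, as for the existing SIMD opcodes, the new opcodes in the `0x100`-`0x12F` range take two bytes for the sub-opcode. The helper name `encode_vec_opcode` below is invented for this sketch.
+
+```python
+def encode_vec_opcode(subopcode: int) -> bytes:
+    # 0xfd prefix followed by the sub-opcode as an unsigned LEB128.
+    out = bytearray([0xFD])
+    n = subopcode
+    while True:
+        byte = n & 0x7F
+        n >>= 7
+        if n:
+            out.append(byte | 0x80)   # more bytes follow
+        else:
+            out.append(byte)          # last byte
+            return bytes(out)
+
+assert encode_vec_opcode(0x100).hex() == "fd8002"  # i8x16.relaxed_swizzle
+assert encode_vec_opcode(0x113).hex() == "fd9302"  # i32x4.relaxed_dot_i8x16_i7x16_add_s
+```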
+ +| instruction | opcode | prototype opcode | +| ------------------------------------- | -------------- | ---------------- | +| `i8x16.relaxed_swizzle` | 0x100 | 0xa2 | +| `i32x4.relaxed_trunc_f32x4_s` | 0x101 | 0xa5 | +| `i32x4.relaxed_trunc_f32x4_u` | 0x102 | 0xa6 | +| `i32x4.relaxed_trunc_f64x2_s_zero` | 0x103 | 0xc5 | +| `i32x4.relaxed_trunc_f64x2_u_zero` | 0x104 | 0xc6 | +| `f32x4.relaxed_madd` | 0x105 | 0xaf | +| `f32x4.relaxed_nmadd` | 0x106 | 0xb0 | +| `f64x2.relaxed_madd` | 0x107 | 0xcf | +| `f64x2.relaxed_nmadd` | 0x108 | 0xd0 | +| `i8x16.relaxed_laneselect` | 0x109 | 0xb2 | +| `i16x8.relaxed_laneselect` | 0x10a | 0xb3 | +| `i32x4.relaxed_laneselect` | 0x10b | 0xd2 | +| `i64x2.relaxed_laneselect` | 0x10c | 0xd3 | +| `f32x4.relaxed_min` | 0x10d | 0xb4 | +| `f32x4.relaxed_max` | 0x10e | 0xe2 | +| `f64x2.relaxed_min` | 0x10f | 0xd4 | +| `f64x2.relaxed_max` | 0x110 | 0xee | +| `i16x8.relaxed_q15mulr_s` | 0x111 | 0x111 | +| `i16x8.relaxed_dot_i8x16_i7x16_s` | 0x112 | 0x112 | +| `i32x4.relaxed_dot_i8x16_i7x16_add_s` | 0x113 | 0x113 | +| Reserved | 0x114 - 0x12F | | + +## References + +- Poll for phase 1 + [presentation](https://docs.google.com/presentation/d/1Qnx0nbNTRYhMONLuKyygEduCXNOv3xtWODfXfYokx1Y/edit?usp=sharing) + and [meeting + notes](https://github.com/WebAssembly/meetings/blob/master/main/2021/CG-03-16.md) +- Poll for phase 2 + [slides](https://docs.google.com/presentation/d/1zyRqfgGU7HdoVw9QiKaVYifozbytPhNLHUW9jQjPzLk/edit?usp=sharing) + and [meeting + notes](https://github.com/WebAssembly/meetings/blob/main/main/2021/CG-11-09.md#update-on-relaxed-simd-fpenv-discussions-and-poll-for-phase-2-zhi-an-ng-15-min) +- Poll for phase 3 + [slides](https://docs.google.com/presentation/d/1ofBkgbW2AjYM4oayjTPTH3PCbyEn6W-q3GN67qSEigQ/edit?usp=sharing) + and [meeting + notes](https://github.com/WebAssembly/meetings/blob/main/main/2022/CG-04-12.md) +- [SIMD proposal](https://github.com/WebAssembly/simd) diff --git a/test/core/relaxed-simd/i16x8_relaxed_q15mulr_s.wast b/test/core/relaxed-simd/i16x8_relaxed_q15mulr_s.wast new file mode 100644 index 000000000..00f901cbc --- /dev/null +++ b/test/core/relaxed-simd/i16x8_relaxed_q15mulr_s.wast @@ -0,0 +1,28 @@ +;; Tests for i16x8.relaxed_q15mulr_s. +;; `either` comes from https://github.com/WebAssembly/threads. + +(module + (func (export "i16x8.relaxed_q15mulr_s") (param v128 v128) (result v128) (i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1))) + + (func (export "i16x8.relaxed_q15mulr_s_cmp") (param v128 v128) (result v128) + (i16x8.eq + (i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1)) + (i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1)))) +) + +;; INT16_MIN = -32768 +(assert_return (invoke "i16x8.relaxed_q15mulr_s" + (v128.const i16x8 -32768 -32767 32767 0 0 0 0 0) + (v128.const i16x8 -32768 -32768 32767 0 0 0 0 0)) + ;; overflows, return either INT16_MIN or INT16_MAX + (either (v128.const i16x8 -32768 32767 32766 0 0 0 0 0) + (v128.const i16x8 32767 32767 32766 0 0 0 0 0))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. 
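+;; In lane 0, ((-32768 * -32768) + 0x4000) >> 15 = 32768, which does not fit in i16, so a single
+;; call may return either INT16_MIN or INT16_MAX; within one environment the choice is fixed,
+;; so comparing two calls on the same inputs must give all-ones lanes.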
+ +(assert_return (invoke "i16x8.relaxed_q15mulr_s_cmp" + (v128.const i16x8 -32768 -32767 32767 0 0 0 0 0) + (v128.const i16x8 -32768 -32768 32767 0 0 0 0 0)) + ;; overflows, return either INT16_MIN or INT16_MAX + (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) + diff --git a/test/core/relaxed-simd/i32x4_relaxed_trunc.wast b/test/core/relaxed-simd/i32x4_relaxed_trunc.wast new file mode 100644 index 000000000..cca3ecb95 --- /dev/null +++ b/test/core/relaxed-simd/i32x4_relaxed_trunc.wast @@ -0,0 +1,124 @@ +;; Tests for i32x4.relaxed_trunc_f32x4_s, i32x4.relaxed_trunc_f32x4_u, i32x4.relaxed_trunc_f64x2_s_zero, and i32x4.relaxed_trunc_f64x2_u_zero. +;; `either` comes from https://github.com/WebAssembly/threads. + +(module + (func (export "i32x4.relaxed_trunc_f32x4_s") (param v128) (result v128) (i32x4.relaxed_trunc_f32x4_s (local.get 0))) + (func (export "i32x4.relaxed_trunc_f32x4_u") (param v128) (result v128) (i32x4.relaxed_trunc_f32x4_u (local.get 0))) + (func (export "i32x4.relaxed_trunc_f64x2_s_zero") (param v128) (result v128) (i32x4.relaxed_trunc_f64x2_s_zero (local.get 0))) + (func (export "i32x4.relaxed_trunc_f64x2_u_zero") (param v128) (result v128) (i32x4.relaxed_trunc_f64x2_u_zero (local.get 0))) + + (func (export "i32x4.relaxed_trunc_f32x4_s_cmp") (param v128) (result v128) + (i32x4.eq + (i32x4.relaxed_trunc_f32x4_s (local.get 0)) + (i32x4.relaxed_trunc_f32x4_s (local.get 0)))) + (func (export "i32x4.relaxed_trunc_f32x4_u_cmp") (param v128) (result v128) + (i32x4.eq + (i32x4.relaxed_trunc_f32x4_u (local.get 0)) + (i32x4.relaxed_trunc_f32x4_u (local.get 0)))) + (func (export "i32x4.relaxed_trunc_f64x2_s_zero_cmp") (param v128) (result v128) + (i32x4.eq + (i32x4.relaxed_trunc_f64x2_s_zero (local.get 0)) + (i32x4.relaxed_trunc_f64x2_s_zero (local.get 0)))) + (func (export "i32x4.relaxed_trunc_f64x2_u_zero_cmp") (param v128) (result v128) + (i32x4.eq + (i32x4.relaxed_trunc_f64x2_u_zero (local.get 0)) + (i32x4.relaxed_trunc_f64x2_u_zero (local.get 0)))) +) + +;; Test some edge cases around min/max to ensure that the instruction either +;; saturates correctly or returns INT_MIN. +;; +;; Note, though, that INT_MAX itself is not tested. The value for INT_MAX is +;; 2147483647 but that is not representable in a `f32` since it requires 31 bits +;; when a f32 has only 24 bits available. This means that the closest integers +;; to INT_MAX which can be represented are 2147483520 and 2147483648, meaning +;; that the INT_MAX test case cannot be tested. 
+(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s" + ;; INT32_MIN INT32_MAX + (v128.const f32x4 -2147483648.0 -2147483904.0 2.0 2147483904.0)) + ;; out of range -> saturate or INT32_MIN + (either (v128.const i32x4 -2147483648 -2147483648 2 2147483647) + (v128.const i32x4 -2147483648 -2147483648 2 -2147483648))) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s" + (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444)) + ;; nans -> 0 or INT32_MIN + (either (v128.const i32x4 0 0 0 0) + (v128.const i32x4 0x80000000 0x80000000 0x80000000 0x80000000))) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u" + ;; UINT32_MIN UINT32_MIN-1 saturate or UINT32_MAX + (either (v128.const i32x4 0 0 4294967040 0xffffffff) + (v128.const i32x4 0 0xffffffff 4294967040 0xffffffff))) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u" + (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444)) + ;; nans -> 0 or UINT32_MAX + (either (v128.const i32x4 0 0 0 0) + (v128.const i32x4 0xffffffff 0xffffffff 0xffffffff 0xffffffff))) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero" + (v128.const f64x2 -2147483904.0 2147483904.0)) + ;; out of range -> saturate or INT32_MIN + (either (v128.const i32x4 -2147483648 2147483647 0 0) + (v128.const i32x4 -2147483648 -2147483648 0 0))) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero" + (v128.const f64x2 nan -nan)) + (either (v128.const i32x4 0 0 0 0) + (v128.const i32x4 0x80000000 0x80000000 0 0))) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero" + (v128.const f64x2 -1.0 4294967296.0)) + ;; out of range -> saturate or UINT32_MAX + (either (v128.const i32x4 0 0xffffffff 0 0) + (v128.const i32x4 0xffffffff 0xffffffff 0 0))) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero" + (v128.const f64x2 nan -nan)) + (either (v128.const i32x4 0 0 0 0) + (v128.const i32x4 0 0 0xffffffff 0xffffffff))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. 
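+;; Each *_cmp function runs the same relaxed conversion twice on one input and compares the two
+;; results with i32x4.eq, so every lane must be all ones (-1) even where the converted value
+;; itself is implementation-defined (NaN or out-of-range lanes).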
+ +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s_cmp" + ;; INT32_MIN INT32_MAX + (v128.const f32x4 -2147483648.0 -2147483904.0 2147483647.0 2147483904.0)) + ;; out of range -> saturate or INT32_MIN + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s_cmp" + (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444)) + ;; nans -> 0 or INT32_MIN + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u_cmp" + ;; UINT32_MIN UINT32_MIN-1 saturate or UINT32_MAX + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u_cmp" + (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444)) + ;; nans -> 0 or UINT32_MAX + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero_cmp" + (v128.const f64x2 -2147483904.0 2147483904.0)) + ;; out of range -> saturate or INT32_MIN + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero_cmp" + (v128.const f64x2 nan -nan)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero_cmp" + (v128.const f64x2 -1.0 4294967296.0)) + ;; out of range -> saturate or UINT32_MAX + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero_cmp" + (v128.const f64x2 nan -nan)) + (v128.const i32x4 -1 -1 -1 -1)) diff --git a/test/core/relaxed-simd/i8x16_relaxed_swizzle.wast b/test/core/relaxed-simd/i8x16_relaxed_swizzle.wast new file mode 100644 index 000000000..f1bcb4552 --- /dev/null +++ b/test/core/relaxed-simd/i8x16_relaxed_swizzle.wast @@ -0,0 +1,45 @@ +;; Tests for relaxed i8x16 swizzle. +;; `either` comes from https://github.com/WebAssembly/threads. + +(module + (func (export "i8x16.relaxed_swizzle") (param v128 v128) (result v128) (i8x16.relaxed_swizzle (local.get 0) (local.get 1))) + + (func (export "i8x16.relaxed_swizzle_cmp") (param v128 v128) (result v128) + (i8x16.eq + (i8x16.relaxed_swizzle (local.get 0) (local.get 1)) + (i8x16.relaxed_swizzle (local.get 0) (local.get 1)))) +) + +(assert_return (invoke "i8x16.relaxed_swizzle" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)) + (either (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))) + +;; out of range, returns 0 or modulo 15 if < 128 +(assert_return (invoke "i8x16.relaxed_swizzle" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)) + (either (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))) + +;; out of range, returns 0 if >= 128 +(assert_return (invoke "i8x16.relaxed_swizzle" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 128 129 130 131 132 133 134 135 248 249 250 251 252 253 254 255)) + (either (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. 
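+;; The _cmp function swizzles the same inputs twice and compares the results with i8x16.eq, so
+;; the outcome must be all ones regardless of which out-of-range behavior the implementation picks.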
+ +;; out of range, returns 0 or modulo 15 if < 128 +(assert_return (invoke "i8x16.relaxed_swizzle_cmp" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)) + (v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1)) + +;; out of range, returns 0 if >= 128 +(assert_return (invoke "i8x16.relaxed_swizzle_cmp" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 128 129 130 131 132 133 134 135 248 249 250 251 252 253 254 255)) + (v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1)) diff --git a/test/core/relaxed-simd/relaxed_dot_product.wast b/test/core/relaxed-simd/relaxed_dot_product.wast new file mode 100644 index 000000000..48714b87b --- /dev/null +++ b/test/core/relaxed-simd/relaxed_dot_product.wast @@ -0,0 +1,107 @@ +;; Tests for relaxed dot products. +;; `either` comes from https://github.com/WebAssembly/threads. + +(module + (func (export "i16x8.relaxed_dot_i8x16_i7x16_s") (param v128 v128) (result v128) (i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1))) + (func (export "i32x4.relaxed_dot_i8x16_i7x16_add_s") (param v128 v128 v128) (result v128) (i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2))) + + (func (export "i16x8.relaxed_dot_i8x16_i7x16_s_cmp") (param v128 v128) (result v128) + (i16x8.eq + (i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1)) + (i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1)))) + (func (export "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp") (param v128 v128 v128) (result v128) + (i16x8.eq + (i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2)) + (i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2)))) +) + +;; Simple values to ensure things are functional. +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)) + (v128.const i16x8 1 13 41 85 145 221 313 421)) + +;; Test max and min i8 values; +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s" + (v128.const i8x16 -128 -128 127 127 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 127 127 127 127 0 0 0 0 0 0 0 0 0 0 0 0)) + (v128.const i16x8 -32512 32258 0 0 0 0 0 0)) + +;; signed * unsigned : -128 * 129 * 2 = -33,024 saturated to -32,768 +;; signed * signed : -128 * -127 * 2 = 32,512 +;; unsigned * unsigned : 128 * 129 * 2 = 33,024 +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s" + (v128.const i8x16 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0 0 0)) + (either + (v128.const i16x8 -32768 0 0 0 0 0 0 0) + (v128.const i16x8 32512 0 0 0 0 0 0 0) + (v128.const i16x8 33024 0 0 0 0 0 0 0))) + +;; Simple values to ensure things are functional. 
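+;; Worked example for the assertion below: each i32 lane sums four adjacent
+;; signed i8 products and then adds the matching accumulator lane:
+;;   lane 0: (0*0 + 1*1 + 2*2 + 3*3)           + 0 = 14
+;;   lane 1: (4*4 + 5*5 + 6*6 + 7*7)           + 1 = 127
+;;   lane 2: (8*8 + 9*9 + 10*10 + 11*11)       + 2 = 368
+;;   lane 3: (12*12 + 13*13 + 14*14 + 15*15)   + 3 = 737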
+(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i32x4 0 1 2 3)) + ;; intermediate result is [14, 126, 366, 734] + (v128.const i32x4 14 127 368 737)) + +;; Test max and min i8 values; +(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s" + (v128.const i8x16 -128 -128 -128 -128 127 127 127 127 0 0 0 0 0 0 0 0) + (v128.const i8x16 127 127 127 127 127 127 127 127 0 0 0 0 0 0 0 0) + (v128.const i32x4 1 2 3 4)) + ;; intermediate result is [-65024, 64516, 0, 0] + (v128.const i32x4 -65023 64518 3 4)) + +;; signed * unsigned : -128 * 129 * 4 = -66,048 (+ 1) VPDPBUSD AVX2-VNNI or AVX512-VNNI +;; signed * unsigned with intermediate saturation : +;; (-128 * 129) + (-128 * 129) = -33024 saturated to -32768 (PMADDUBSW) +;; -32768 + -32768 = -65536 (+ 1) +;; signed * signed : -128 * -127 * 4 = 65,024 (+ 1) +;; unsigned * unsigned : 128 * 129 * 2 = 66,048 (+ 1) +(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s" + (v128.const i8x16 -128 -128 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 -127 -127 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i32x4 1 2 3 4)) + (either + (v128.const i32x4 -66047 2 3 4) + (v128.const i32x4 -65535 2 3 4) + (v128.const i32x4 65025 2 3 4) + (v128.const i32x4 66049 2 3 4))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. + +;; Test max and min i8 values; +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s_cmp" + (v128.const i8x16 -128 -128 127 127 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 127 127 127 127 0 0 0 0 0 0 0 0 0 0 0 0)) + (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) + +;; Test max and min i8 values; +(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp" + (v128.const i8x16 -128 -128 -128 -128 127 127 127 127 0 0 0 0 0 0 0 0) + (v128.const i8x16 127 127 127 127 127 127 127 127 0 0 0 0 0 0 0 0) + (v128.const i32x4 1 2 3 4)) + ;; intermediate result is [-65024, 64516, 0, 0] + (v128.const i32x4 -1 -1 -1 -1)) + +;; signed * unsigned : -128 * 129 * 2 = -33,024 saturated to -32,768 +;; signed * signed : -128 * -127 * 2 = 32,512 +;; unsigned * unsigned : 128 * 129 * 2 = 33,024 +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s_cmp" + (v128.const i8x16 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0 0 0)) + (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) + +;; signed * unsigned : -128 * 129 * 4 = -66,048 (+ 1) VPDPBUSD AVX2-VNNI or AVX512-VNNI +;; signed * unsigned with intermediate saturation : +;; (-128 * 129) + (-128 * 129) = -33024 saturated to -32768 (PMADDUBSW) +;; -32768 + -32768 = -65536 (+ 1) +;; signed * signed : -128 * -127 * 4 = 65,024 (+ 1) +;; unsigned * unsigned : 128 * 129 * 2 = 66,048 (+ 1) +(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp" + (v128.const i8x16 -128 -128 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 -127 -127 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i32x4 1 2 3 4)) + (v128.const i32x4 -1 -1 -1 -1)) diff --git a/test/core/relaxed-simd/relaxed_laneselect.wast b/test/core/relaxed-simd/relaxed_laneselect.wast new file mode 100644 index 000000000..10913816b --- /dev/null +++ b/test/core/relaxed-simd/relaxed_laneselect.wast @@ -0,0 +1,103 @@ +;; Tests for i8x16.relaxed_laneselect, i16x8.relaxed_laneselect, i32x4.relaxed_laneselect, and i64x2.relaxed_laneselect. +;; `either` comes from https://github.com/WebAssembly/threads. 
+ +(module + (func (export "i8x16.relaxed_laneselect") (param v128 v128 v128) (result v128) (i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))) + (func (export "i16x8.relaxed_laneselect") (param v128 v128 v128) (result v128) (i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))) + (func (export "i32x4.relaxed_laneselect") (param v128 v128 v128) (result v128) (i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))) + (func (export "i64x2.relaxed_laneselect") (param v128 v128 v128) (result v128) (i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))) + + (func (export "i8x16.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128) + (i8x16.eq + (i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)) + (i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))) + (func (export "i16x8.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128) + (i16x8.eq + (i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)) + (i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))) + (func (export "i32x4.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128) + (i32x4.eq + (i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)) + (i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))) + (func (export "i64x2.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128) + (i64x2.eq + (i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)) + (i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))) +) + +(assert_return (invoke "i8x16.relaxed_laneselect" + (v128.const i8x16 0 1 0x12 0x12 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 16 17 0x34 0x34 20 21 22 23 24 25 26 27 28 29 30 31) + (v128.const i8x16 0xff 0 0xf0 0x0f 0 0 0 0 0 0 0 0 0 0 0 0)) + (either (v128.const i8x16 0 17 0x14 0x32 20 21 22 23 24 25 26 27 28 29 30 31) + (v128.const i8x16 0 17 0x12 0x34 20 21 22 23 24 25 26 27 28 29 30 31))) + +(assert_return (invoke "i16x8.relaxed_laneselect" + (v128.const i16x8 0 1 0x1234 0x1234 4 5 6 7) + (v128.const i16x8 8 9 0x5678 0x5678 12 13 14 15) + (v128.const i16x8 0xffff 0 0xff00 0x00ff 0 0 0 0)) + (either (v128.const i16x8 0 9 0x1278 0x5634 12 13 14 15) + (v128.const i16x8 0 9 0x1234 0x5678 12 13 14 15))) + +;; special case for i16x8 to allow pblendvb +(assert_return (invoke "i16x8.relaxed_laneselect" + (v128.const i16x8 0 1 0x1234 0x1234 4 5 6 7) + (v128.const i16x8 8 9 0x5678 0x5678 12 13 14 15) + (v128.const i16x8 0xffff 0 0xff00 0x0080 0 0 0 0)) ;; 0x0080 is the special case + (either (v128.const i16x8 0 9 0x1278 0x5678 12 13 14 15) ;; bitselect + (v128.const i16x8 0 9 0x1234 0x5678 12 13 14 15) ;; top bit of i16 lane examined + (v128.const i16x8 0 9 0x1278 0x5634 12 13 14 15) ;; top bit of each byte + )) + +(assert_return (invoke "i32x4.relaxed_laneselect" + (v128.const i32x4 0 1 0x12341234 0x12341234) + (v128.const i32x4 4 5 0x56785678 0x56785678) + (v128.const i32x4 0xffffffff 0 0xffff0000 0x0000ffff)) + (either (v128.const i32x4 0 5 0x12345678 0x56781234) + (v128.const i32x4 0 5 0x12341234 0x56785678))) + +(assert_return (invoke "i64x2.relaxed_laneselect" + (v128.const i64x2 0 1) + (v128.const i64x2 2 3) + (v128.const i64x2 0xffffffffffffffff 0)) + (either (v128.const i64x2 0 3) + (v128.const i64x2 0 3))) + +(assert_return (invoke "i64x2.relaxed_laneselect" + (v128.const i64x2 0x1234123412341234 0x1234123412341234) + (v128.const i64x2 0x5678567856785678 0x5678567856785678) + (v128.const i64x2 0xffffffff00000000 
0x00000000ffffffff)) + (either (v128.const i64x2 0x1234123456785678 0x5678567812341234) + (v128.const i64x2 0x1234123412341234 0x5678567856785678))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. + +(assert_return (invoke "i8x16.relaxed_laneselect_cmp" + (v128.const i8x16 0 1 0x12 0x12 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 16 17 0x34 0x34 20 21 22 23 24 25 26 27 28 29 30 31) + (v128.const i8x16 0xff 0 0xf0 0x0f 0 0 0 0 0 0 0 0 0 0 0 0)) + (v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1)) + +(assert_return (invoke "i16x8.relaxed_laneselect_cmp" + (v128.const i16x8 0 1 0x1234 0x1234 4 5 6 7) + (v128.const i16x8 8 9 0x5678 0x5678 12 13 14 15) + (v128.const i16x8 0xffff 0 0xff00 0x00ff 0 0 0 0)) + (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_laneselect_cmp" + (v128.const i32x4 0 1 0x12341234 0x12341234) + (v128.const i32x4 4 5 0x56785678 0x56785678) + (v128.const i32x4 0xffffffff 0 0xffff0000 0x0000ffff)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i64x2.relaxed_laneselect_cmp" + (v128.const i64x2 0 1) + (v128.const i64x2 2 3) + (v128.const i64x2 0xffffffffffffffff 0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "i64x2.relaxed_laneselect_cmp" + (v128.const i64x2 0x1234123412341234 0x1234123412341234) + (v128.const i64x2 0x5678567856785678 0x5678567856785678) + (v128.const i64x2 0xffffffff00000000 0x00000000ffffffff)) + (v128.const i64x2 -1 -1)) diff --git a/test/core/relaxed-simd/relaxed_madd_nmadd.wast b/test/core/relaxed-simd/relaxed_madd_nmadd.wast new file mode 100644 index 000000000..187b71d5a --- /dev/null +++ b/test/core/relaxed-simd/relaxed_madd_nmadd.wast @@ -0,0 +1,224 @@ +;; Tests for f32x4.relaxed_madd, f32x4.relaxed_nmadd, f64x2.relaxed_madd, and f64x2.relaxed_nmadd. +;; `either` comes from https://github.com/WebAssembly/threads. 
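+;; Note (illustrative): `relaxed_madd a b c` may return either the fused result
+;; fma(a, b, c), rounded once, or the unfused `a*b + c`, rounded after both the
+;; multiply and the add; `relaxed_nmadd` does the same with the product negated,
+;; i.e. -(a*b) + c. A sketch of the unfused f32x4 form:
+;;
+;;   (f32x4.add (f32x4.mul (local.get 0) (local.get 1)) (local.get 2))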
+
+(module
+  (func (export "f32x4.relaxed_madd") (param v128 v128 v128) (result v128) (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2)))
+  (func (export "f32x4.relaxed_nmadd") (param v128 v128 v128) (result v128) (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)))
+  (func (export "f64x2.relaxed_nmadd") (param v128 v128 v128) (result v128) (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)))
+  (func (export "f64x2.relaxed_madd") (param v128 v128 v128) (result v128) (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2)))
+
+  (func (export "f32x4.relaxed_madd_cmp") (param v128 v128 v128) (result v128)
+    (f32x4.eq
+      (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2))
+      (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2))))
+  (func (export "f32x4.relaxed_nmadd_cmp") (param v128 v128 v128) (result v128)
+    (f32x4.eq
+      (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))
+      (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))))
+  (func (export "f64x2.relaxed_nmadd_cmp") (param v128 v128 v128) (result v128)
+    (f64x2.eq
+      (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))
+      (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))))
+  (func (export "f64x2.relaxed_madd_cmp") (param v128 v128 v128) (result v128)
+    (f64x2.eq
+      (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2))
+      (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2))))
+)
+
+
+;; FLT_MAX == 0x1.fffffep+127
+;; FLT_MAX * 2 - FLT_MAX ==
+;;   FLT_MAX (if fma)
+;;   +inf (if no fma, since FLT_MAX * 2 overflows before the subtraction)
+;; from https://www.vinc17.net/software/fma-tests.c
+(assert_return (invoke "f32x4.relaxed_madd"
+                 (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127)
+                 (v128.const f32x4 2.0 2.0 2.0 2.0)
+                 (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127))
+               (either (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127)
+                       (v128.const f32x4 inf inf inf inf)))
+
+;; Special values for float:
+;; x = 0x1.000004p+0 (1 + 2^-22)
+;; y = 0x1.0002p+0 (1 + 2^-15)
+;; z = -(1.0 + 0x0.0002p+0 + 0x0.000004p+0)
+;;   = -0x1.000204p+0
+;; x.y = 1.0 + 0x0.0002p+0 + 0x0.000004p+0 + 0x1p-37 (round bit)
+;; x.y+z = 0 (2 roundings)
+;; fma(x, y, z) = 0x1p-37 (2^-37)
+;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information
+(assert_return (invoke "f32x4.relaxed_madd"
+                 (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0)
+                 (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0)
+                 (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
+               (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37)
+                       (v128.const f32x4 0 0 0 0)))
+;; nmadd tests with negated x, same answers are expected.
+(assert_return (invoke "f32x4.relaxed_nmadd"
+                 (v128.const f32x4 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0)
+                 (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0)
+                 (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
+               (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37)
+                       (v128.const f32x4 0 0 0 0)))
+;; nmadd tests with negated y, same answers are expected.
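+;; (Since relaxed_nmadd computes -(x*y) + z, negating exactly one of x or y
+;; flips the product's sign back, so these are the same computations as the
+;; relaxed_madd cases above and the same `either` alternatives apply.)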
+(assert_return (invoke "f32x4.relaxed_nmadd" + (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0) + (v128.const f32x4 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37) + (v128.const f32x4 0 0 0 0))) + +;; DBL_MAX = 0x1.fffffffffffffp+1023 +;; DLB_MAX * 2 - DLB_MAX == +;; DLB_MAX (if fma) +;; 0 (if no fma) +;; form https://www.vinc17.net/software/fma-tests.c +;; from https://www.vinc17.net/software/fma-tests.c +(assert_return (invoke "f64x2.relaxed_madd" + (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023) + (v128.const f64x2 2.0 2.0) + (v128.const f64x2 -0x1.fffffffffffffp+1023 -0x1.fffffffffffffp+1023)) + (either (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023) + (v128.const f64x2 inf inf))) + +;; Special values for double: +;; x = 0x1.00000004p+0 (1 + 2^-30) +;; y = 0x1.000002p+0 (1 + 2^-23) +;; z = -(1.0 + 0x0.000002p+0 + 0x0.00000004p+0) +;; = -0x1.00000204p+0 +;; x.y = 1.0 + 0x0.000002p+0 + 0x0.00000004p+0 + 0x1p-53 (round bit) +;; x.y+z = 0 (2 roundings) +;; fma(x, y, z) = 0x1p-53 +;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information +(assert_return (invoke "f64x2.relaxed_madd" + (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0) + (v128.const f64x2 0x1.000002p+0 0x1.000002p+0) + (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0)) + (either (v128.const f64x2 0x1p-53 0x1p-53) + (v128.const f64x2 0 0))) +;; nmadd tests with negated x, same answers are expected. +(assert_return (invoke "f64x2.relaxed_nmadd" + (v128.const f64x2 -0x1.00000004p+0 -0x1.00000004p+0) + (v128.const f64x2 0x1.000002p+0 0x1.000002p+0) + (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0)) + (either (v128.const f64x2 0x1p-53 0x1p-53) + (v128.const f64x2 0 0))) +;; nmadd tests with negated y, same answers are expected. +(assert_return (invoke "f64x2.relaxed_nmadd" + (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0) + (v128.const f64x2 -0x1.000002p+0 -0x1.000002p+0) + (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0)) + (either (v128.const f64x2 0x1p-53 0x1p-53) + (v128.const f64x2 0 0))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. 
+
+;; FLT_MAX == 0x1.fffffep+127
+;; FLT_MAX * 2 - FLT_MAX ==
+;;   FLT_MAX (if fma)
+;;   +inf (if no fma, since FLT_MAX * 2 overflows before the subtraction)
+;; from https://www.vinc17.net/software/fma-tests.c
+(assert_return (invoke "f32x4.relaxed_madd_cmp"
+                 (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127)
+                 (v128.const f32x4 2.0 2.0 2.0 2.0)
+                 (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127))
+               (v128.const i32x4 -1 -1 -1 -1))
+
+;; Special values for float:
+;; x = 0x1.000004p+0 (1 + 2^-22)
+;; y = 0x1.0002p+0 (1 + 2^-15)
+;; z = -(1.0 + 0x0.0002p+0 + 0x0.000004p+0)
+;;   = -0x1.000204p+0
+;; x.y = 1.0 + 0x0.0002p+0 + 0x0.000004p+0 + 0x1p-37 (round bit)
+;; x.y+z = 0 (2 roundings)
+;; fma(x, y, z) = 0x1p-37 (2^-37)
+;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information
+(assert_return (invoke "f32x4.relaxed_madd_cmp"
+                 (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0)
+                 (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0)
+                 (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
+               (v128.const i32x4 -1 -1 -1 -1))
+;; nmadd tests with negated x, same answers are expected.
+(assert_return (invoke "f32x4.relaxed_nmadd_cmp"
+                 (v128.const f32x4 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0)
+                 (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0)
+                 (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
+               (v128.const i32x4 -1 -1 -1 -1))
+;; nmadd tests with negated y, same answers are expected.
+(assert_return (invoke "f32x4.relaxed_nmadd_cmp"
+                 (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0)
+                 (v128.const f32x4 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0)
+                 (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
+               (v128.const i32x4 -1 -1 -1 -1))
+
+;; DBL_MAX = 0x1.fffffffffffffp+1023
+;; DBL_MAX * 2 - DBL_MAX ==
+;;   DBL_MAX (if fma)
+;;   +inf (if no fma, since DBL_MAX * 2 overflows before the subtraction)
+;; from https://www.vinc17.net/software/fma-tests.c
+(assert_return (invoke "f64x2.relaxed_madd_cmp"
+                 (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023)
+                 (v128.const f64x2 2.0 2.0)
+                 (v128.const f64x2 -0x1.fffffffffffffp+1023 -0x1.fffffffffffffp+1023))
+               (v128.const i64x2 -1 -1))
+
+;; Special values for double:
+;; x = 0x1.00000004p+0 (1 + 2^-30)
+;; y = 0x1.000002p+0 (1 + 2^-23)
+;; z = -(1.0 + 0x0.000002p+0 + 0x0.00000004p+0)
+;;   = -0x1.00000204p+0
+;; x.y = 1.0 + 0x0.000002p+0 + 0x0.00000004p+0 + 0x1p-53 (round bit)
+;; x.y+z = 0 (2 roundings)
+;; fma(x, y, z) = 0x1p-53
+;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information
+(assert_return (invoke "f64x2.relaxed_madd_cmp"
+                 (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0)
+                 (v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
+                 (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
+               (v128.const i64x2 -1 -1))
+;; nmadd tests with negated x, same answers are expected.
+(assert_return (invoke "f64x2.relaxed_nmadd_cmp"
+                 (v128.const f64x2 -0x1.00000004p+0 -0x1.00000004p+0)
+                 (v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
+                 (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
+               (v128.const i64x2 -1 -1))
+;; nmadd tests with negated y, same answers are expected.
+(assert_return (invoke "f64x2.relaxed_nmadd_cmp" + (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0) + (v128.const f64x2 -0x1.000002p+0 -0x1.000002p+0) + (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0)) + (v128.const i64x2 -1 -1)) + +;; Test that the non-deterministic choice of fusing and then rounding or +;; rounding multiple times in `relaxed_madd` is consistent throughout a +;; program's execution. +;; +;; This property is impossible to test exhaustively, so this is just a simple +;; smoke test for when the operands to a `relaxed_madd` are known statically +;; versus when they are dynamically supplied. This should, at least, catch +;; illegal constant-folding and -propagation by the compiler that leads to +;; inconsistent rounding behavior at compile time versus at run time. +;; +;; FLT_MAX == 0x1.fffffep+127 +;; FLT_MAX * 2 - FLT_MAX == +;; FLT_MAX (if fma) +;; 0 (if no fma) +;; from https://www.vinc17.net/software/fma-tests.c +(module + (func (export "test-consistent-nondeterminism") (param v128 v128 v128) (result v128) + (f32x4.eq + (f32x4.relaxed_madd (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 ) + (v128.const f32x4 2.0 2.0 2.0 2.0) + (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127)) + (f32x4.relaxed_madd (local.get 0) + (local.get 1) + (local.get 2)) + ) + ) +) +(assert_return (invoke "test-consistent-nondeterminism" + (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 ) + (v128.const f32x4 2.0 2.0 2.0 2.0) + (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127)) + (v128.const i32x4 -1 -1 -1 -1)) diff --git a/test/core/relaxed-simd/relaxed_min_max.wast b/test/core/relaxed-simd/relaxed_min_max.wast new file mode 100644 index 000000000..ac3ebb07c --- /dev/null +++ b/test/core/relaxed-simd/relaxed_min_max.wast @@ -0,0 +1,184 @@ +;; Tests for f32x4.min, f32x4.max, f64x2.min, and f64x2.max. +;; `either` comes from https://github.com/WebAssembly/threads. 
+ +(module + (func (export "f32x4.relaxed_min") (param v128 v128) (result v128) (f32x4.relaxed_min (local.get 0) (local.get 1))) + (func (export "f32x4.relaxed_max") (param v128 v128) (result v128) (f32x4.relaxed_max (local.get 0) (local.get 1))) + (func (export "f64x2.relaxed_min") (param v128 v128) (result v128) (f64x2.relaxed_min (local.get 0) (local.get 1))) + (func (export "f64x2.relaxed_max") (param v128 v128) (result v128) (f64x2.relaxed_max (local.get 0) (local.get 1))) + + (func (export "f32x4.relaxed_min_cmp") (param v128 v128) (result v128) + (i32x4.eq + (f32x4.relaxed_min (local.get 0) (local.get 1)) + (f32x4.relaxed_min (local.get 0) (local.get 1)))) + (func (export "f32x4.relaxed_max_cmp") (param v128 v128) (result v128) + (i32x4.eq + (f32x4.relaxed_max (local.get 0) (local.get 1)) + (f32x4.relaxed_max (local.get 0) (local.get 1)))) + (func (export "f64x2.relaxed_min_cmp") (param v128 v128) (result v128) + (i64x2.eq + (f64x2.relaxed_min (local.get 0) (local.get 1)) + (f64x2.relaxed_min (local.get 0) (local.get 1)))) + (func (export "f64x2.relaxed_max_cmp") (param v128 v128) (result v128) + (i64x2.eq + (f64x2.relaxed_max (local.get 0) (local.get 1)) + (f64x2.relaxed_max (local.get 0) (local.get 1)))) +) + +(assert_return (invoke "f32x4.relaxed_min" + (v128.const f32x4 -nan nan 0 0) + (v128.const f32x4 0 0 -nan nan)) + (either (v128.const f32x4 nan:canonical nan:canonical nan:canonical nan:canonical) + (v128.const f32x4 nan:canonical nan:canonical 0 0) + (v128.const f32x4 0 0 nan:canonical nan:canonical) + (v128.const f32x4 0 0 0 0))) + +(assert_return (invoke "f32x4.relaxed_min" + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)) + (either (v128.const f32x4 -0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 -0.0 +0.0 -0.0))) + +(assert_return (invoke "f32x4.relaxed_max" + (v128.const f32x4 -nan nan 0 0) + (v128.const f32x4 0 0 -nan nan)) + (either (v128.const f32x4 nan:canonical nan:canonical nan:canonical nan:canonical) + (v128.const f32x4 nan:canonical nan:canonical 0 0) + (v128.const f32x4 0 0 nan:canonical nan:canonical) + (v128.const f32x4 0 0 0 0))) + +(assert_return (invoke "f32x4.relaxed_max" + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)) + (either (v128.const f32x4 +0.0 +0.0 +0.0 -0.0) + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 -0.0 +0.0 -0.0))) + +(assert_return (invoke "f64x2.relaxed_min" + (v128.const f64x2 -nan nan) + (v128.const f64x2 0 0)) + (either (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0) + (v128.const f64x2 0 0))) + +(assert_return (invoke "f64x2.relaxed_min" + (v128.const f64x2 0 0) + (v128.const f64x2 -nan nan)) + (either (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0) + (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0))) + +(assert_return (invoke "f64x2.relaxed_min" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0)) + (either (v128.const f64x2 -0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0) + (v128.const f64x2 -0.0 -0.0))) + +(assert_return (invoke "f64x2.relaxed_min" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0)) + (either (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0))) + +(assert_return (invoke 
"f64x2.relaxed_max" + (v128.const f64x2 -nan nan) + (v128.const f64x2 0 0)) + (either (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0) + (v128.const f64x2 0 0))) + +(assert_return (invoke "f64x2.relaxed_max" + (v128.const f64x2 0 0) + (v128.const f64x2 -nan nan)) + (either (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0) + (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0))) + +(assert_return (invoke "f64x2.relaxed_max" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0)) + (either (v128.const f64x2 +0.0 +0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0) + (v128.const f64x2 -0.0 -0.0))) + +(assert_return (invoke "f64x2.relaxed_max" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0)) + (either (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. + +(assert_return (invoke "f32x4.relaxed_min_cmp" + (v128.const f32x4 -nan nan 0 0) + (v128.const f32x4 0 0 -nan nan)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "f32x4.relaxed_min_cmp" + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "f32x4.relaxed_max_cmp" + (v128.const f32x4 -nan nan 0 0) + (v128.const f32x4 0 0 -nan nan)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "f32x4.relaxed_max_cmp" + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_min_cmp" + (v128.const f64x2 -nan nan) + (v128.const f64x2 0 0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_min_cmp" + (v128.const f64x2 0 0) + (v128.const f64x2 -nan nan)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_min_cmp" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_min_cmp" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_max_cmp" + (v128.const f64x2 -nan nan) + (v128.const f64x2 0 0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_max_cmp" + (v128.const f64x2 0 0) + (v128.const f64x2 -nan nan)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_max_cmp" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_max_cmp" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0)) + (v128.const i64x2 -1 -1)) diff --git a/test/core/run.py b/test/core/run.py index 8437837c8..d15379977 100755 --- a/test/core/run.py +++ b/test/core/run.py @@ -33,9 +33,10 @@ main_test_files = glob.glob(os.path.join(inputDir, "*.wast")) # Other test files are in subdirectories simd_test_files = glob.glob(os.path.join(inputDir, "simd", "*.wast")) +relaxed_simd_test_files = glob.glob(os.path.join(inputDir, "relaxed-simd", "*.wast")) gc_test_files = glob.glob(os.path.join(inputDir, "gc", "*.wast")) multi_memory_test_files = glob.glob(os.path.join(inputDir, "multi-memory", "*.wast")) -all_test_files = main_test_files + simd_test_files + gc_test_files + multi_memory_test_files +all_test_files = main_test_files + simd_test_files + relaxed_simd_test_files + 
gc_test_files + multi_memory_test_files wasmExec = arguments.wasm wasmCommand = wasmExec + " " + arguments.opts