From d6dc7cf71bf52f866c092e92ce374f0266ebee1a Mon Sep 17 00:00:00 2001
From: Danial Klimkin
Date: Fri, 30 Aug 2024 10:21:55 +0200
Subject: [PATCH 01/98] Fix bazel build past 89e6a288674c (#106685)

---
 utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index 1bf6cdbb447a4c..b2dcc696b0ad06 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -4160,6 +4160,7 @@ cc_library(
         ":Option",
         ":Support",
         ":WindowsManifest",
+        ":config",
     ],
 )
 
@@ -4601,6 +4602,7 @@ cc_binary(
         ":Target",
         ":TargetParser",
         ":TransformUtils",
+        ":config",
     ],
 )

From 0722b8ab8168d9e1aa3413a62c65878f407225ae Mon Sep 17 00:00:00 2001
From: Younan Zhang
Date: Fri, 30 Aug 2024 16:29:18 +0800
Subject: [PATCH 02/98] [Clang][NFC] Consolidate tests for default argument
 substitution (#105617)

Follow-up on 8ac140f39. The test `SemaTemplate/default-parm-init.cpp` was
introduced with the fix #80288 and mainly did two things:

- Ensure the default arguments are properly substituted inside both the
  primary template and its explicit / out-of-line specializations.
- Ensure the strategy doesn't mess up the substitution of a lambda
  expression used as a default argument.

The 1st covers the bug in #68490, yet it does some redundant work: each of
the member functions is duplicated, once for the `sizeof` operator and once
for `alignof`, even though the principle under the hood is the same. So this
patch removes the duplication, reducing the 8 functions to 4 that exercise
the same thing.

The 2nd presumably tests that the fix in #80288 doesn't impact a complicated
substitution. However, that seems unnecessary and unrelated to the original
issue; more importantly, we have never had any problem with it, so that test
is removed as well.

The test for default arguments is merged into
`SemaTemplate/default-arguments.cpp` under a new namespace, which hopefully
reduces the entropy of our testing cases.
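For intuition, the pattern those four remaining functions pin down can be
reduced to a few lines. This is an illustrative sketch distilled from the
merged test below; it is not part of the diff itself:

// A default argument that names a template parameter must be substituted
// against the specialization the member is actually instantiated from.
template <typename T> struct S {
  template <typename U>
  constexpr int SizeOfT(int param = sizeof(T)) const;
};

// Out-of-line definition for the explicit specialization S<int>: a call with
// no argument still takes the default from the primary declaration, so
// `param` must arrive as sizeof(int).
template <>
template <typename U>
constexpr int S<int>::SizeOfT(int param) const { return param; }

static_assert(S<int>().SizeOfT<char>() == sizeof(int), "");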
---
 clang/test/SemaTemplate/default-arguments.cpp |  52 +++++
 clang/test/SemaTemplate/default-parm-init.cpp | 190 ------------------
 2 files changed, 52 insertions(+), 190 deletions(-)
 delete mode 100644 clang/test/SemaTemplate/default-parm-init.cpp

diff --git a/clang/test/SemaTemplate/default-arguments.cpp b/clang/test/SemaTemplate/default-arguments.cpp
index d5d9687cc90f49..3b1fbda414c12b 100644
--- a/clang/test/SemaTemplate/default-arguments.cpp
+++ b/clang/test/SemaTemplate/default-arguments.cpp
@@ -229,3 +229,55 @@ namespace unevaluated {
 template int f(int = a); // expected-warning 0-1{{extension}}
 int k = sizeof(f());
 }
+
+#if __cplusplus >= 201103L
+namespace GH68490 {
+
+template <typename T> struct S {
+  template <typename U>
+  constexpr int SizeOfU(int param = sizeof(U)) const;
+
+  template <typename U>
+  constexpr int SizeOfT(int param = sizeof(T)) const;
+};
+
+template <typename T> struct S<T *> {
+  template <typename U>
+  constexpr int SizeOfU(int param = sizeof(U)) const;
+
+  template <typename U>
+  constexpr int SizeOfT(int param = sizeof(T *)) const;
+};
+
+template <typename T>
+template <typename U>
+constexpr int S<T *>::SizeOfU(int param) const {
+  return param;
+}
+
+template <typename T>
+template <typename U>
+constexpr int S<T *>::SizeOfT(int param) const {
+  return param;
+}
+
+template <>
+template <typename U>
+constexpr int S<int>::SizeOfU(int param) const {
+  return param;
+}
+
+template <>
+template <typename U>
+constexpr int S<int>::SizeOfT(int param) const {
+  return param;
+}
+
+static_assert(S<int>().SizeOfU<char>() == sizeof(char), "");
+static_assert(S<int>().SizeOfT<char>() == sizeof(int), "");
+static_assert(S<short *>().SizeOfU<char>() == sizeof(char), "");
+static_assert(S<short *>().SizeOfT<char>() == sizeof(short *), "");
+
+} // namespace GH68490
+
+#endif
diff --git a/clang/test/SemaTemplate/default-parm-init.cpp b/clang/test/SemaTemplate/default-parm-init.cpp
deleted file mode 100644
index 73ba8998df6a98..00000000000000
--- a/clang/test/SemaTemplate/default-parm-init.cpp
+++ /dev/null
@@ -1,190 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -std=c++17 -verify %s
-// RUN: %clang_cc1 -fsyntax-only -std=c++20 -verify %s
-// expected-no-diagnostics
-
-namespace std {
-
-template <typename> class function;
-
-template <typename R, typename... Args> class invoker_base {
-public:
-  virtual ~invoker_base() { }
-  virtual R invoke(Args...) = 0;
-  virtual invoker_base* clone() = 0;
-};
-
-template <typename F, typename R, typename... Args>
-class functor_invoker : public invoker_base<R, Args...> {
-public:
-  explicit functor_invoker(const F& f) : f(f) { }
-  R invoke(Args... args) { return f(args...); }
-  functor_invoker* clone() { return new functor_invoker(f); }
-
-private:
-  F f;
-};
-
-template <typename R, typename... Args>
-class function<R (Args...)> {
-public:
-  typedef R result_type;
-  function() : invoker (0) { }
-  function(const function& other) : invoker(0) {
-    if (other.invoker)
-      invoker = other.invoker->clone();
-  }
-
-  template <typename F> function(const F& f) : invoker(0) {
-    invoker = new functor_invoker<F, R, Args...>(f);
-  }
-
-  ~function() {
-    if (invoker)
-      delete invoker;
-  }
-
-  function& operator=(const function& other) {
-    function(other).swap(*this);
-    return *this;
-  }
-
-  template <typename F>
-  function& operator=(const F& f) {
-    function(f).swap(*this);
-    return *this;
-  }
-
-  void swap(function& other) {
-    invoker_base<R, Args...>* tmp = invoker;
-    invoker = other.invoker;
-    other.invoker = tmp;
-  }
-
-  result_type operator()(Args... args) const {
-    return invoker->invoke(args...);
-  }
-
-private:
-  invoker_base<R, Args...>* invoker;
-};
-
-}
-
-template <typename TemplateParam>
-struct Problem {
-  template <typename FunctionTemplateParam>
-  constexpr int FuncAlign(int param = alignof(FunctionTemplateParam));
-
-  template <typename FunctionTemplateParam>
-  constexpr int FuncSizeof(int param = sizeof(FunctionTemplateParam));
-
-  template <typename FunctionTemplateParam>
-  constexpr int FuncAlign2(int param = alignof(TemplateParam));
-
-  template <typename FunctionTemplateParam>
-  constexpr int FuncSizeof2(int param = sizeof(TemplateParam));
-};
-
-template <typename TemplateParam>
-struct Problem<TemplateParam*> {
-  template <typename FunctionTemplateParam>
-  constexpr int FuncAlign(int param = alignof(FunctionTemplateParam));
-
-  template <typename FunctionTemplateParam>
-  constexpr int FuncSizeof(int param = sizeof(FunctionTemplateParam));
-
-  template <typename FunctionTemplateParam>
-  constexpr int FuncAlign2(int param = alignof(TemplateParam));
-
-  template <typename FunctionTemplateParam>
-  constexpr int FuncSizeof2(int param = sizeof(TemplateParam));
-};
-
-template <typename TemplateParam>
-template <typename FunctionTemplateParam>
-constexpr int Problem<TemplateParam*>::FuncAlign(int param) {
-  return 2U*param;
-}
-
-template <typename TemplateParam>
-template <typename FunctionTemplateParam>
-constexpr int Problem<TemplateParam*>::FuncSizeof(int param) {
-  return 2U*param;
-}
-
-template <typename TemplateParam>
-template <typename FunctionTemplateParam>
-constexpr int Problem<TemplateParam*>::FuncAlign2(int param) {
-  return 2U*param;
-}
-
-template <typename TemplateParam>
-template <typename FunctionTemplateParam>
-constexpr int Problem<TemplateParam*>::FuncSizeof2(int param) {
-  return 2U*param;
-}
-
-template <>
-template <typename FunctionTemplateParam>
-constexpr int Problem<int>::FuncAlign(int param) {
-  return param;
-}
-
-template <>
-template <typename FunctionTemplateParam>
-constexpr int Problem<int>::FuncSizeof(int param) {
-  return param;
-}
-
-template <>
-template <typename FunctionTemplateParam>
-constexpr int Problem<int>::FuncAlign2(int param) {
-  return param;
-}
-
-template <>
-template <typename FunctionTemplateParam>
-constexpr int Problem<int>::FuncSizeof2(int param) {
-  return param;
-}
-
-void foo() {
-  Problem<int> p = {};
-  static_assert(p.FuncAlign<char>() == alignof(char));
-  static_assert(p.FuncSizeof<char>() == sizeof(char));
-  static_assert(p.FuncAlign2<char>() == alignof(int));
-  static_assert(p.FuncSizeof2<char>() == sizeof(int));
-  Problem<short*> q = {};
-  static_assert(q.FuncAlign<char>() == 2U * alignof(char));
-  static_assert(q.FuncSizeof<char>() == 2U * sizeof(char));
-  static_assert(q.FuncAlign2<char>() == 2U *alignof(short));
-  static_assert(q.FuncSizeof2<char>() == 2U * sizeof(short));
-}
-
-template <typename T>
-class A {
- public:
-  void run(
-    std::function<void(T&)> f1 = [](auto&&) {},
-    std::function<void(T&)> f2 = [](auto&&) {});
- private:
-  class Helper {
-   public:
-    explicit Helper(std::function<void(T&)> f2) : f2_(f2) {}
-    std::function<void(T&)> f2_;
-  };
-};
-
-template <typename T>
-void A<T>::run(std::function<void(T&)> f1,
-               std::function<void(T&)> f2) {
-  Helper h(f2);
-}
-
-struct B {};
-
-int main() {
-  A<B> a;
-  a.run([&](auto& l) {});
-  return 0;
-}

From 5b77e254e814eb9a56d31c30a5c8289c07d8a6ff Mon Sep 17 00:00:00 2001
From: wanglei
Date: Fri, 30 Aug 2024 16:37:20 +0800
Subject: [PATCH 03/98] [LoongArch] Pre-commit test for immediate value
 materialization using BSTRINS_D

Reviewed By: SixWeining

Pull Request: https://github.com/llvm/llvm-project/pull/106331
---
 llvm/test/CodeGen/LoongArch/imm.ll | 62 ++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/llvm/test/CodeGen/LoongArch/imm.ll b/llvm/test/CodeGen/LoongArch/imm.ll
index f84fddaec66b9f..746306bacc8d57 100644
--- a/llvm/test/CodeGen/LoongArch/imm.ll
+++ b/llvm/test/CodeGen/LoongArch/imm.ll
@@ -164,3 +164,65 @@ define i64 @imm0008000080000800() {
 ; CHECK-NEXT:    ret
   ret i64 2251801961170944
 }
+
+define i64 @imm14000000a() {
+; CHECK-LABEL: imm14000000a:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a0, 262144
+; CHECK-NEXT:    ori $a0, $a0, 10
+; CHECK-NEXT:    lu32i.d $a0, 1
+; CHECK-NEXT:    ret
+  ret i64 5368709130
+}
+
+define i64 @imm0fff000000000fff() {
+; CHECK-LABEL: imm0fff000000000fff:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ori $a0, $zero, 4095
+; CHECK-NEXT:    lu32i.d $a0, -65536
+; CHECK-NEXT:    lu52i.d $a0, $a0, 255
+; CHECK-NEXT:    ret
+  ret i64 1152640029630140415
+}
+
+define i64 @immffecffffffffffec() {
+; CHECK-LABEL: immffecffffffffffec:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi.w $a0, $zero, -20
+; CHECK-NEXT:    lu32i.d $a0, -196609
+; CHECK-NEXT:    lu52i.d $a0, $a0, -2
+; CHECK-NEXT:    ret
+  ret i64 -5348024557502484
+}
+
+define i64 @imm1c000000700000() {
+; CHECK-LABEL: imm1c000000700000:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a0, 1792
+; CHECK-NEXT:    lu32i.d $a0, -262144
+; CHECK-NEXT:    lu52i.d $a0, $a0, 1
+; CHECK-NEXT:    ret
+  ret i64 7881299355238400
+}
+
+define i64 @immf0f0f0f0f0f0f0f0() {
+; CHECK-LABEL: immf0f0f0f0f0f0f0f0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a0, -61681
+; CHECK-NEXT:    ori $a0, $a0, 240
+; CHECK-NEXT:    lu32i.d $a0, 61680
+; CHECK-NEXT:    lu52i.d $a0, $a0, -241
+; CHECK-NEXT:    ret
+  ret i64 -1085102592571150096
+}
+
+define i64 @imm110000014000000a() {
+; CHECK-LABEL: imm110000014000000a:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a0, 262144
+; CHECK-NEXT:    ori $a0, $a0, 10
+; CHECK-NEXT:    lu32i.d $a0, 1
+; CHECK-NEXT:    lu52i.d $a0, $a0, 272
+; CHECK-NEXT:    ret
+  ret i64 1224979104013484042
+}

From eaf87d32754beb5bec10bab517bf56e25575b48e Mon Sep 17 00:00:00 2001
From: wanglei
Date: Tue, 27 Aug 2024 15:13:15 +0800
Subject: [PATCH 04/98] [LoongArch] Optimize for immediate value
 materialization using BSTRINS_D instruction

Reviewed By: heiher, SixWeining

Pull Request: https://github.com/llvm/llvm-project/pull/106332
---
 .../AsmParser/LoongArchAsmParser.cpp          | 30 ++++++--
 .../LoongArch/LoongArchISelDAGToDAG.cpp       | 22 +++++-
 .../Target/LoongArch/LoongArchInstrInfo.cpp   |  8 ++
 .../MCTargetDesc/LoongArchMatInt.cpp          | 73 +++++++++++++++++++
 .../LoongArch/MCTargetDesc/LoongArchMatInt.h  |  1 +
 .../test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll | 12 +--
 llvm/test/CodeGen/LoongArch/imm.ll            | 25 +++----
 .../LoongArch/ir-instruction/load-store.ll    | 10 +--
 .../CodeGen/LoongArch/merge-base-offset.ll    |  6 +-
 llvm/test/CodeGen/LoongArch/sextw-removal.ll  | 40 ++++------
 llvm/test/MC/LoongArch/Macros/macros-li.s     |  3 +-
 11 files changed, 161 insertions(+), 69 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
index c2ae4a0734b6a7..b8f1cdfd2cb354 100644
--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
+++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
@@ -1291,14 +1291,32 @@ void LoongArchAsmParser::emitLoadImm(MCInst &Inst, SMLoc IDLoc,
     Imm = SignExtend64<32>(Imm);
 
   for (LoongArchMatInt::Inst &Inst : LoongArchMatInt::generateInstSeq(Imm)) {
-    unsigned Opc = Inst.Opc;
-    if (Opc == LoongArch::LU12I_W)
-      Out.emitInstruction(MCInstBuilder(Opc).addReg(DestReg).addImm(Inst.Imm),
-                          getSTI());
-    else
+    switch (Inst.Opc) {
+    case LoongArch::LU12I_W:
       Out.emitInstruction(
-          MCInstBuilder(Opc).addReg(DestReg).addReg(SrcReg).addImm(Inst.Imm),
+          MCInstBuilder(Inst.Opc).addReg(DestReg).addImm(Inst.Imm), getSTI());
+      break;
+    case LoongArch::ADDI_W:
+    case LoongArch::ORI:
+    case LoongArch::LU32I_D:
+    case LoongArch::LU52I_D:
+      Out.emitInstruction(
+          MCInstBuilder(Inst.Opc).addReg(DestReg).addReg(SrcReg).addImm(
+              Inst.Imm),
           getSTI());
+      break;
+    case LoongArch::BSTRINS_D:
+      Out.emitInstruction(MCInstBuilder(Inst.Opc)
+                              .addReg(DestReg)
+                              .addReg(SrcReg)
+                              .addReg(SrcReg)
+                              .addImm(Inst.Imm >> 32)
+                              .addImm(Inst.Imm & 0xFF),
+                          getSTI());
+      break;
+    default:
+      llvm_unreachable("unexpected opcode generated by LoongArchMatInt");
+    }
     SrcReg = DestReg;
   }
 }
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp
index b6ade6b978d2ce..70ed1e6fbdbdac 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp
@@ -62,10 +62,26 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) {
     // The instructions in the sequence are handled here.
     for (LoongArchMatInt::Inst &Inst : LoongArchMatInt::generateInstSeq(Imm)) {
       SDValue SDImm = CurDAG->getTargetConstant(Inst.Imm, DL, GRLenVT);
-      if (Inst.Opc == LoongArch::LU12I_W)
-        Result = CurDAG->getMachineNode(LoongArch::LU12I_W, DL, GRLenVT, SDImm);
-      else
+      switch (Inst.Opc) {
+      case LoongArch::LU12I_W:
+        Result = CurDAG->getMachineNode(Inst.Opc, DL, GRLenVT, SDImm);
+        break;
+      case LoongArch::ADDI_W:
+      case LoongArch::ORI:
+      case LoongArch::LU32I_D:
+      case LoongArch::LU52I_D:
         Result = CurDAG->getMachineNode(Inst.Opc, DL, GRLenVT, SrcReg, SDImm);
+        break;
+      case LoongArch::BSTRINS_D:
+        Result = CurDAG->getMachineNode(
+            Inst.Opc, DL, GRLenVT,
+            {SrcReg, SrcReg,
+             CurDAG->getTargetConstant(Inst.Imm >> 32, DL, GRLenVT),
+             CurDAG->getTargetConstant(Inst.Imm & 0xFF, DL, GRLenVT)});
+        break;
+      default:
+        llvm_unreachable("unexpected opcode generated by LoongArchMatInt");
+      }
 
       SrcReg = SDValue(Result, 0);
     }
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
index 9059da460f1358..d1af65192ee612 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -210,6 +210,14 @@ void LoongArchInstrInfo::movImm(MachineBasicBlock &MBB,
           .addImm(Inst.Imm)
           .setMIFlag(Flag);
       break;
+    case LoongArch::BSTRINS_D:
+      BuildMI(MBB, MBBI, DL, get(Inst.Opc), DstReg)
+          .addReg(SrcReg, RegState::Kill)
+          .addReg(SrcReg, RegState::Kill)
+          .addImm(Inst.Imm >> 32)
+          .addImm(Inst.Imm & 0xFF)
+          .setMIFlag(Flag);
+      break;
     default:
       assert(false && "Unknown insn emitted by LoongArchMatInt");
     }
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp
index 1509c436c81098..1ce1a9845db21c 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp
@@ -26,11 +26,13 @@ LoongArchMatInt::InstSeq LoongArchMatInt::generateInstSeq(int64_t Val) {
   const int64_t Lo12 = Val & 0xFFF;
   InstSeq Insts;
 
+  // LU52I_D used for: Bits[63:52] | Bits[51:0].
   if (Highest12 != 0 && SignExtend64<52>(Val) == 0) {
     Insts.push_back(Inst(LoongArch::LU52I_D, SignExtend64<12>(Highest12)));
     return Insts;
   }
 
+  // lo32
   if (Hi20 == 0)
     Insts.push_back(Inst(LoongArch::ORI, Lo12));
   else if (SignExtend32<1>(Lo12 >> 11) == SignExtend32<20>(Hi20))
@@ -41,11 +43,82 @@ LoongArchMatInt::InstSeq LoongArchMatInt::generateInstSeq(int64_t Val) {
       Insts.push_back(Inst(LoongArch::ORI, Lo12));
   }
 
+  // hi32
+  // Higher20
   if (SignExtend32<1>(Hi20 >> 19) != SignExtend32<20>(Higher20))
     Insts.push_back(Inst(LoongArch::LU32I_D, SignExtend64<20>(Higher20)));
 
+  // Highest12
   if (SignExtend32<1>(Higher20 >> 19) != SignExtend32<12>(Highest12))
     Insts.push_back(Inst(LoongArch::LU52I_D, SignExtend64<12>(Highest12)));
 
+  size_t N = Insts.size();
+  if (N < 3)
+    return Insts;
+
+  // When the number of instruction sequences is greater than 2, we have the
+  // opportunity to optimize using the BSTRINS_D instruction. The scenario is as
+  // follows:
+  //
+  // N of Insts = 3
+  // 1. ORI + LU32I_D + LU52I_D     => ORI + BSTRINS_D,     TmpVal = ORI
+  // 2. ADDI_W + LU32I_D + LU52I_D  => ADDI_W + BSTRINS_D,  TmpVal = ADDI_W
+  // 3. LU12I_W + ORI + LU32I_D     => ORI + BSTRINS_D,     TmpVal = ORI
+  // 4. LU12I_W + LU32I_D + LU52I_D => LU12I_W + BSTRINS_D, TmpVal = LU12I_W
+  //
+  // N of Insts = 4
+  // 5. LU12I_W + ORI + LU32I_D + LU52I_D => LU12I_W + ORI + BSTRINS_D
+  //                                      => ORI + LU52I_D + BSTRINS_D
+  //    TmpVal = (LU12I_W | ORI) or (ORI | LU52I_D)
+  // The BSTRINS_D instruction will use the `TmpVal` to construct the `Val`.
+  uint64_t TmpVal1 = 0;
+  uint64_t TmpVal2 = 0;
+  switch (Insts[0].Opc) {
+  default:
+    llvm_unreachable("unexpected opcode");
+    break;
+  case LoongArch::LU12I_W:
+    if (Insts[1].Opc == LoongArch::ORI) {
+      TmpVal1 = Insts[1].Imm;
+      if (N == 3)
+        break;
+      TmpVal2 = Insts[3].Imm << 52 | TmpVal1;
+    }
+    TmpVal1 |= Insts[0].Imm << 12;
+    break;
+  case LoongArch::ORI:
+  case LoongArch::ADDI_W:
+    TmpVal1 = Insts[0].Imm;
+    break;
+  }
+
+  for (uint64_t Msb = 32; Msb < 64; ++Msb) {
+    uint64_t HighMask = ~((1ULL << (Msb + 1)) - 1);
+    for (uint64_t Lsb = Msb; Lsb > 0; --Lsb) {
+      uint64_t LowMask = (1ULL << Lsb) - 1;
+      uint64_t Mask = HighMask | LowMask;
+      uint64_t LsbToZero = TmpVal1 & ((1UL << (Msb - Lsb + 1)) - 1);
+      uint64_t MsbToLsb = LsbToZero << Lsb;
+      if ((MsbToLsb | (TmpVal1 & Mask)) == (uint64_t)Val) {
+        if (Insts[1].Opc == LoongArch::ORI && N == 3)
+          Insts[0] = Insts[1];
+        Insts.pop_back_n(2);
+        Insts.push_back(Inst(LoongArch::BSTRINS_D, Msb << 32 | Lsb));
+        return Insts;
+      }
+      if (TmpVal2 != 0) {
+        LsbToZero = TmpVal2 & ((1UL << (Msb - Lsb + 1)) - 1);
+        MsbToLsb = LsbToZero << Lsb;
+        if ((MsbToLsb | (TmpVal2 & Mask)) == (uint64_t)Val) {
+          Insts[0] = Insts[1];
+          Insts[1] = Insts[3];
+          Insts.pop_back_n(2);
+          Insts.push_back(Inst(LoongArch::BSTRINS_D, Msb << 32 | Lsb));
+          return Insts;
+        }
+      }
+    }
+  }
+
   return Insts;
 }
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.h
index be1b425894de1a..3a3c12c353fb8e 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.h
@@ -16,6 +16,7 @@ namespace llvm {
 namespace LoongArchMatInt {
 struct Inst {
   unsigned Opc;
+  // Imm: Opc's imm operand, if Opc == BSTRINS_D, Imm = MSB << 32 | LSB.
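+  // Worked example (taken from the imm.ll test in this patch): materializing
+  // 0xffecffffffffffec as `addi.w $a0, $zero, -20; bstrins.d $a0, $a0, 52, 48`
+  // stores Imm = 52 << 32 | 48 for the BSTRINS_D entry.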
   int64_t Imm;
   Inst(unsigned Opc, int64_t Imm) : Opc(Opc), Imm(Imm) {}
 };
diff --git a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
index f17cec231f3236..3efdd08bbea4c4 100644
--- a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
@@ -338,14 +338,12 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ; LA64-NEXT:    srli.d $a1, $a0, 1
 ; LA64-NEXT:    lu12i.w $a2, 349525
 ; LA64-NEXT:    ori $a2, $a2, 1365
-; LA64-NEXT:    lu32i.d $a2, 349525
-; LA64-NEXT:    lu52i.d $a2, $a2, 1365
+; LA64-NEXT:    bstrins.d $a2, $a2, 62, 32
 ; LA64-NEXT:    and $a1, $a1, $a2
 ; LA64-NEXT:    sub.d $a0, $a0, $a1
 ; LA64-NEXT:    lu12i.w $a1, 209715
 ; LA64-NEXT:    ori $a1, $a1, 819
-; LA64-NEXT:    lu32i.d $a1, 209715
-; LA64-NEXT:    lu52i.d $a1, $a1, 819
+; LA64-NEXT:    bstrins.d $a1, $a1, 61, 32
 ; LA64-NEXT:    and $a2, $a0, $a1
 ; LA64-NEXT:    srli.d $a0, $a0, 2
 ; LA64-NEXT:    and $a0, $a0, $a1
@@ -354,13 +352,11 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ; LA64-NEXT:    add.d $a0, $a0, $a1
 ; LA64-NEXT:    lu12i.w $a1, 61680
 ; LA64-NEXT:    ori $a1, $a1, 3855
-; LA64-NEXT:    lu32i.d $a1, -61681
-; LA64-NEXT:    lu52i.d $a1, $a1, 240
+; LA64-NEXT:    bstrins.d $a1, $a1, 59, 32
 ; LA64-NEXT:    and $a0, $a0, $a1
 ; LA64-NEXT:    lu12i.w $a1, 4112
 ; LA64-NEXT:    ori $a1, $a1, 257
-; LA64-NEXT:    lu32i.d $a1, 65793
-; LA64-NEXT:    lu52i.d $a1, $a1, 16
+; LA64-NEXT:    bstrins.d $a1, $a1, 56, 32
 ; LA64-NEXT:    mul.d $a0, $a0, $a1
 ; LA64-NEXT:    srli.d $a0, $a0, 56
 ; LA64-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/imm.ll b/llvm/test/CodeGen/LoongArch/imm.ll
index 746306bacc8d57..aca508e99fb960 100644
--- a/llvm/test/CodeGen/LoongArch/imm.ll
+++ b/llvm/test/CodeGen/LoongArch/imm.ll
@@ -47,8 +47,7 @@ define i64 @imm0008000000000fff() {
 ; CHECK-LABEL: imm0008000000000fff:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ori $a0, $zero, 4095
-; CHECK-NEXT:    lu32i.d $a0, -524288
-; CHECK-NEXT:    lu52i.d $a0, $a0, 0
+; CHECK-NEXT:    bstrins.d $a0, $a0, 51, 51
 ; CHECK-NEXT:    ret
   ret i64 2251799813689343
 }
@@ -168,9 +167,8 @@ define i64 @imm0008000080000800() {
 define i64 @imm14000000a() {
 ; CHECK-LABEL: imm14000000a:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lu12i.w $a0, 262144
-; CHECK-NEXT:    ori $a0, $a0, 10
-; CHECK-NEXT:    lu32i.d $a0, 1
+; CHECK-NEXT:    ori $a0, $zero, 10
+; CHECK-NEXT:    bstrins.d $a0, $a0, 32, 29
 ; CHECK-NEXT:    ret
   ret i64 5368709130
 }
@@ -179,8 +177,7 @@ define i64 @imm0fff000000000fff() {
 ; CHECK-LABEL: imm0fff000000000fff:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ori $a0, $zero, 4095
-; CHECK-NEXT:    lu32i.d $a0, -65536
-; CHECK-NEXT:    lu52i.d $a0, $a0, 255
+; CHECK-NEXT:    bstrins.d $a0, $a0, 59, 48
 ; CHECK-NEXT:    ret
   ret i64 1152640029630140415
 }
@@ -189,8 +186,7 @@ define i64 @immffecffffffffffec() {
 ; CHECK-LABEL: immffecffffffffffec:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi.w $a0, $zero, -20
-; CHECK-NEXT:    lu32i.d $a0, -196609
-; CHECK-NEXT:    lu52i.d $a0, $a0, -2
+; CHECK-NEXT:    bstrins.d $a0, $a0, 52, 48
 ; CHECK-NEXT:    ret
   ret i64 -5348024557502484
 }
@@ -199,8 +195,7 @@ define i64 @imm1c000000700000() {
 ; CHECK-LABEL: imm1c000000700000:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    lu12i.w $a0, 1792
-; CHECK-NEXT:    lu32i.d $a0, -262144
-; CHECK-NEXT:    lu52i.d $a0, $a0, 1
+; CHECK-NEXT:    bstrins.d $a0, $a0, 52, 30
 ; CHECK-NEXT:    ret
   ret i64 7881299355238400
 }
@@ -210,8 +205,7 @@ define i64 @immf0f0f0f0f0f0f0f0() {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    lu12i.w $a0, -61681
 ; CHECK-NEXT:    ori $a0, $a0, 240
-; CHECK-NEXT:    lu32i.d $a0, 61680
-; CHECK-NEXT:    lu52i.d $a0, $a0, -241
+; CHECK-NEXT:    bstrins.d $a0, $a0, 59, 32
 ; CHECK-NEXT:    ret
   ret i64 -1085102592571150096
 }
@@ -219,10 +213,9 @@ define i64 @immf0f0f0f0f0f0f0f0() {
 define i64 @imm110000014000000a() {
 ; CHECK-LABEL: imm110000014000000a:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lu12i.w $a0, 262144
-; CHECK-NEXT:    ori $a0, $a0, 10
-; CHECK-NEXT:    lu32i.d $a0, 1
+; CHECK-NEXT:    ori $a0, $zero, 10
 ; CHECK-NEXT:    lu52i.d $a0, $a0, 272
+; CHECK-NEXT:    bstrins.d $a0, $a0, 32, 29
 ; CHECK-NEXT:    ret
   ret i64 1224979104013484042
 }
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll
index 772ae8d81a88bf..9654542f877459 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll
@@ -973,9 +973,8 @@ define i64 @ld_sd_constant(i64 %a) nounwind {
 ; LA64NOPIC-LABEL: ld_sd_constant:
 ; LA64NOPIC:       # %bb.0:
 ; LA64NOPIC-NEXT:    lu12i.w $a1, -136485
-; LA64NOPIC-NEXT:    ori $a1, $a1, 3823
-; LA64NOPIC-NEXT:    lu32i.d $a1, -147729
-; LA64NOPIC-NEXT:    lu52i.d $a2, $a1, -534
+; LA64NOPIC-NEXT:    ori $a2, $a1, 3823
+; LA64NOPIC-NEXT:    bstrins.d $a2, $a2, 61, 32
 ; LA64NOPIC-NEXT:    ld.d $a1, $a2, 0
 ; LA64NOPIC-NEXT:    st.d $a0, $a2, 0
 ; LA64NOPIC-NEXT:    move $a0, $a1
@@ -984,9 +983,8 @@ define i64 @ld_sd_constant(i64 %a) nounwind {
 ; LA64PIC-LABEL: ld_sd_constant:
 ; LA64PIC:       # %bb.0:
 ; LA64PIC-NEXT:    lu12i.w $a1, -136485
-; LA64PIC-NEXT:    ori $a1, $a1, 3823
-; LA64PIC-NEXT:    lu32i.d $a1, -147729
-; LA64PIC-NEXT:    lu52i.d $a2, $a1, -534
+; LA64PIC-NEXT:    ori $a2, $a1, 3823
+; LA64PIC-NEXT:    bstrins.d $a2, $a2, 61, 32
 ; LA64PIC-NEXT:    ld.d $a1, $a2, 0
 ; LA64PIC-NEXT:    st.d $a0, $a2, 0
 ; LA64PIC-NEXT:    move $a0, $a1
diff --git a/llvm/test/CodeGen/LoongArch/merge-base-offset.ll b/llvm/test/CodeGen/LoongArch/merge-base-offset.ll
index 1e7a79beb62c61..323858c7613a67 100644
--- a/llvm/test/CodeGen/LoongArch/merge-base-offset.ll
+++ b/llvm/test/CodeGen/LoongArch/merge-base-offset.ll
@@ -1128,8 +1128,7 @@ define dso_local ptr @load_addr_offset_614750729487779976() nounwind {
 ; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(g_a64)
 ; LA64-NEXT:    lu12i.w $a1, 279556
 ; LA64-NEXT:    ori $a1, $a1, 1088
-; LA64-NEXT:    lu32i.d $a1, 17472
-; LA64-NEXT:    lu52i.d $a1, $a1, 1092
+; LA64-NEXT:    bstrins.d $a1, $a1, 62, 32
 ; LA64-NEXT:    add.d $a0, $a0, $a1
 ; LA64-NEXT:    ret
 ;
@@ -1142,8 +1141,7 @@ define dso_local ptr @load_addr_offset_614750729487779976() nounwind {
 ; LA64-LARGE-NEXT:    add.d $a0, $a1, $a0
 ; LA64-LARGE-NEXT:    lu12i.w $a1, 279556
 ; LA64-LARGE-NEXT:    ori $a1, $a1, 1088
-; LA64-LARGE-NEXT:    lu32i.d $a1, 17472
-; LA64-LARGE-NEXT:    lu52i.d $a1, $a1, 1092
+; LA64-LARGE-NEXT:    bstrins.d $a1, $a1, 62, 32
 ; LA64-LARGE-NEXT:    add.d $a0, $a0, $a1
 ; LA64-LARGE-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/LoongArch/sextw-removal.ll b/llvm/test/CodeGen/LoongArch/sextw-removal.ll
index 2bb39395c1d1b6..7500b5ae09359a 100644
--- a/llvm/test/CodeGen/LoongArch/sextw-removal.ll
+++ b/llvm/test/CodeGen/LoongArch/sextw-removal.ll
@@ -323,21 +323,17 @@ define void @test7(i32 signext %arg, i32 signext %arg1) nounwind {
 ; CHECK-NEXT:    st.d $s2, $sp, 8 # 8-byte Folded Spill
 ; CHECK-NEXT:    sra.w $a0, $a0, $a1
 ; CHECK-NEXT:    lu12i.w $a1, 349525
-; CHECK-NEXT:    ori $a1, $a1, 1365
-; CHECK-NEXT:    lu32i.d $a1, 349525
-; CHECK-NEXT:    lu52i.d $fp, $a1, 1365
+; CHECK-NEXT:    ori $fp, $a1, 1365
+; CHECK-NEXT:    bstrins.d $fp, $fp, 62, 32
 ; CHECK-NEXT:    lu12i.w $a1, 209715
-; CHECK-NEXT:    ori $a1, $a1, 819
-; CHECK-NEXT:    lu32i.d $a1, 209715
-; CHECK-NEXT:    lu52i.d $s0, $a1, 819
+; CHECK-NEXT:    ori $s0, $a1, 819
+; CHECK-NEXT:    bstrins.d $s0, $s0, 61, 32
 ; CHECK-NEXT:    lu12i.w $a1, 61680
-; CHECK-NEXT:    ori $a1, $a1, 3855
-; CHECK-NEXT:    lu32i.d $a1, -61681
-; CHECK-NEXT:    lu52i.d $s1, $a1, 240
+; CHECK-NEXT:    ori $s1, $a1, 3855
+; CHECK-NEXT:    bstrins.d $s1, $s1, 59, 32
 ; CHECK-NEXT:    lu12i.w $a1, 4112
-; CHECK-NEXT:    ori $a1, $a1, 257
-; CHECK-NEXT:    lu32i.d $a1, 65793
-; CHECK-NEXT:    lu52i.d $s2, $a1, 16
+; CHECK-NEXT:    ori $s2, $a1, 257
+; CHECK-NEXT:    bstrins.d $s2, $s2, 56, 32
 ; CHECK-NEXT:    .p2align 4, , 16
 ; CHECK-NEXT:  .LBB6_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -374,21 +370,17 @@ define void @test7(i32 signext %arg, i32 signext %arg1) nounwind {
 ; NORMV-NEXT:    st.d $s2, $sp, 8 # 8-byte Folded Spill
 ; NORMV-NEXT:    sra.w $a0, $a0, $a1
 ; NORMV-NEXT:    lu12i.w $a1, 349525
-; NORMV-NEXT:    ori $a1, $a1, 1365
-; NORMV-NEXT:    lu32i.d $a1, 349525
-; NORMV-NEXT:    lu52i.d $fp, $a1, 1365
+; NORMV-NEXT:    ori $fp, $a1, 1365
+; NORMV-NEXT:    bstrins.d $fp, $fp, 62, 32
 ; NORMV-NEXT:    lu12i.w $a1, 209715
-; NORMV-NEXT:    ori $a1, $a1, 819
-; NORMV-NEXT:    lu32i.d $a1, 209715
-; NORMV-NEXT:    lu52i.d $s0, $a1, 819
+; NORMV-NEXT:    ori $s0, $a1, 819
+; NORMV-NEXT:    bstrins.d $s0, $s0, 61, 32
 ; NORMV-NEXT:    lu12i.w $a1, 61680
-; NORMV-NEXT:    ori $a1, $a1, 3855
-; NORMV-NEXT:    lu32i.d $a1, -61681
-; NORMV-NEXT:    lu52i.d $s1, $a1, 240
+; NORMV-NEXT:    ori $s1, $a1, 3855
+; NORMV-NEXT:    bstrins.d $s1, $s1, 59, 32
 ; NORMV-NEXT:    lu12i.w $a1, 4112
-; NORMV-NEXT:    ori $a1, $a1, 257
-; NORMV-NEXT:    lu32i.d $a1, 65793
-; NORMV-NEXT:    lu52i.d $s2, $a1, 16
+; NORMV-NEXT:    ori $s2, $a1, 257
+; NORMV-NEXT:    bstrins.d $s2, $s2, 56, 32
 ; NORMV-NEXT:    .p2align 4, , 16
 ; NORMV-NEXT:  .LBB6_1: # %bb2
 ; NORMV-NEXT:    # =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/MC/LoongArch/Macros/macros-li.s b/llvm/test/MC/LoongArch/Macros/macros-li.s
index 994aa439effa1b..8ac82a766f6043 100644
--- a/llvm/test/MC/LoongArch/Macros/macros-li.s
+++ b/llvm/test/MC/LoongArch/Macros/macros-li.s
@@ -45,8 +45,7 @@ li.d $a0, 0x7ffff00000800
 
 li.d $a0, 0x8000000000fff
 # CHECK:      ori $a0, $zero, 4095
-# CHECK-NEXT: lu32i.d $a0, -524288
-# CHECK-NEXT: lu52i.d $a0, $a0, 0
+# CHECK-NEXT: bstrins.d $a0, $a0, 51, 51
 
 li.d $a0, 0x8000080000800
 # CHECK: lu12i.w $a0, -524288
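To make the search in generateInstSeq concrete, here is a small standalone
sketch of the (Msb, Lsb) window scan it performs. This is illustrative only:
`bstrins` and `findBstrinsWindow` are invented names, and the real code also
tracks a second candidate value (TmpVal2) and rewrites the instruction list
in place, which is simplified away here.

#include <cstdint>
#include <cstdio>
#include <optional>
#include <utility>

// BSTRINS_D rd, rj, msb, lsb copies bits [msb-lsb:0] of rj into bits
// [msb:lsb] of rd and leaves the remaining bits of rd unchanged.
static uint64_t bstrins(uint64_t rd, uint64_t rj, unsigned msb, unsigned lsb) {
  uint64_t width = msb - lsb + 1;
  uint64_t mask = ((width == 64) ? ~0ULL : ((1ULL << width) - 1)) << lsb;
  return (rd & ~mask) | ((rj << lsb) & mask);
}

// Scan the same windows as generateInstSeq: if inserting the low bits of the
// partially materialized value `tmp` into some window of `tmp` itself
// reproduces `val`, one BSTRINS_D replaces the remaining instructions.
static std::optional<std::pair<unsigned, unsigned>>
findBstrinsWindow(uint64_t tmp, uint64_t val) {
  for (unsigned msb = 32; msb < 64; ++msb)
    for (unsigned lsb = msb; lsb > 0; --lsb)
      if (bstrins(tmp, tmp, msb, lsb) == val)
        return std::make_pair(msb, lsb);
  return std::nullopt;
}

int main() {
  // For 0x0fff000000000fff, `ori $a0, $zero, 4095` gives tmp = 0xfff, and
  // the scan finds the window the updated imm.ll test expects:
  // bstrins.d $a0, $a0, 59, 48.
  if (auto w = findBstrinsWindow(0xfff, 0x0fff000000000fffULL))
    std::printf("msb=%u lsb=%u\n", w->first, w->second);
  return 0;
}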
-passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefixes=RV64ZVFHMIN + +define void @fptosi() { +; RV32ZVFH-LABEL: 'fptosi' +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptosi <2 x half> undef to <2 x i8> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptosi <2 x half> undef to <2 x i16> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptosi <2 x half> undef to <2 x i32> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptosi <2 x half> undef to <2 x i64> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptosi <2 x half> undef to <2 x i1> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptosi <4 x half> undef to <4 x i8> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptosi <4 x half> undef to <4 x i16> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptosi <4 x half> undef to <4 x i32> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptosi <4 x half> undef to <4 x i64> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptosi <4 x half> undef to <4 x i1> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptosi <8 x half> undef to <8 x i8> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptosi <8 x half> undef to <8 x i16> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptosi <8 x half> undef to <8 x i32> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptosi <8 x half> undef to <8 x i64> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptosi <8 x half> undef to <8 x i1> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptosi <16 x half> undef to <16 x i8> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptosi <16 x half> undef to <16 x i16> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptosi <16 x half> undef to <16 x i32> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptosi <16 x half> undef to <16 x i64> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptosi <16 x half> undef to <16 x i1> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptosi <32 x half> undef to <32 x i8> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptosi <32 x half> undef to <32 x i16> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptosi <32 x half> undef to <32 x i32> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptosi <32 x half> undef to <32 x i64> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptosi <32 x half> undef to <32 x i1> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptosi <64 x half> undef to <64 x i8> +; RV32ZVFH-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %v64f16_v64i16 = fptosi <64 x half> undef to <64 x i16> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptosi <64 x half> undef to <64 x i32> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptosi <64 x half> undef to <64 x i64> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptosi <64 x half> undef to <64 x i1> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptosi <128 x half> undef to <128 x i8> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptosi <128 x half> undef to <128 x i16> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptosi <128 x half> undef to <128 x i32> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptosi <128 x half> undef to <128 x i64> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptosi <128 x half> undef to <128 x i1> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %nxv16f16_nxv16i16 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64f16_nxv64i64 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; RV32ZVFHMIN-LABEL: 'fptosi' +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptosi <2 x half> undef to <2 x i8> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptosi <2 x half> undef to <2 x i16> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptosi <2 x half> undef to <2 x i32> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptosi <2 x half> undef to <2 x i64> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptosi <2 x half> undef to <2 x i1> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptosi <4 x half> undef to <4 x i8> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptosi <4 x half> undef to <4 x i16> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptosi <4 x half> undef to <4 x i32> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptosi <4 x half> undef to <4 x i64> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptosi <4 x half> undef to <4 x i1> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptosi <8 x half> undef to <8 x i8> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptosi <8 x half> undef to <8 x i16> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptosi <8 x half> undef to <8 x i32> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptosi <8 x half> undef to <8 x i64> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = 
fptosi <8 x half> undef to <8 x i1> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptosi <16 x half> undef to <16 x i8> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptosi <16 x half> undef to <16 x i16> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptosi <16 x half> undef to <16 x i32> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptosi <16 x half> undef to <16 x i64> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptosi <16 x half> undef to <16 x i1> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptosi <32 x half> undef to <32 x i8> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptosi <32 x half> undef to <32 x i16> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptosi <32 x half> undef to <32 x i32> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptosi <32 x half> undef to <32 x i64> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptosi <32 x half> undef to <32 x i1> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptosi <64 x half> undef to <64 x i8> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptosi <64 x half> undef to <64 x i16> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptosi <64 x half> undef to <64 x i32> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptosi <64 x half> undef to <64 x i64> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptosi <64 x half> undef to <64 x i1> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptosi <128 x half> undef to <128 x i8> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptosi <128 x half> undef to <128 x i16> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptosi <128 x half> undef to <128 x i32> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptosi <128 x half> undef to <128 x i64> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptosi <128 x half> undef to <128 x i1> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptosi undef to +; 
RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64f16_nxv64i64 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; RV64ZVFH-LABEL: 'fptosi' +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptosi <2 x half> undef to <2 x i8> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptosi 
<2 x half> undef to <2 x i16> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptosi <2 x half> undef to <2 x i32> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptosi <2 x half> undef to <2 x i64> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptosi <2 x half> undef to <2 x i1> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptosi <4 x half> undef to <4 x i8> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptosi <4 x half> undef to <4 x i16> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptosi <4 x half> undef to <4 x i32> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptosi <4 x half> undef to <4 x i64> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptosi <4 x half> undef to <4 x i1> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptosi <8 x half> undef to <8 x i8> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptosi <8 x half> undef to <8 x i16> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptosi <8 x half> undef to <8 x i32> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptosi <8 x half> undef to <8 x i64> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptosi <8 x half> undef to <8 x i1> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptosi <16 x half> undef to <16 x i8> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptosi <16 x half> undef to <16 x i16> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptosi <16 x half> undef to <16 x i32> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptosi <16 x half> undef to <16 x i64> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptosi <16 x half> undef to <16 x i1> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptosi <32 x half> undef to <32 x i8> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptosi <32 x half> undef to <32 x i16> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptosi <32 x half> undef to <32 x i32> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptosi <32 x half> undef to <32 x i64> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptosi <32 x half> undef to <32 x i1> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptosi <64 x half> undef to <64 x i8> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptosi <64 x half> undef to <64 x i16> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptosi <64 x half> undef to <64 x i32> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptosi <64 x half> undef to <64 x i64> +; RV64ZVFH-NEXT: Cost Model: 
Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptosi <64 x half> undef to <64 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptosi <128 x half> undef to <128 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptosi <128 x half> undef to <128 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptosi <128 x half> undef to <128 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptosi <128 x half> undef to <128 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptosi <128 x half> undef to <128 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64f16_nxv64i64 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; RV64ZVFHMIN-LABEL: 'fptosi'
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptosi <2 x half> undef to <2 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptosi <2 x half> undef to <2 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptosi <2 x half> undef to <2 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptosi <2 x half> undef to <2 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptosi <2 x half> undef to <2 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptosi <4 x half> undef to <4 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptosi <4 x half> undef to <4 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptosi <4 x half> undef to <4 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptosi <4 x half> undef to <4 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptosi <4 x half> undef to <4 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptosi <8 x half> undef to <8 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptosi <8 x half> undef to <8 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptosi <8 x half> undef to <8 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptosi <8 x half> undef to <8 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptosi <8 x half> undef to <8 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptosi <16 x half> undef to <16 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptosi <16 x half> undef to <16 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptosi <16 x half> undef to <16 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptosi <16 x half> undef to <16 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptosi <16 x half> undef to <16 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptosi <32 x half> undef to <32 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptosi <32 x half> undef to <32 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptosi <32 x half> undef to <32 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptosi <32 x half> undef to <32 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptosi <32 x half> undef to <32 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptosi <64 x half> undef to <64 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptosi <64 x half> undef to <64 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptosi <64 x half> undef to <64 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptosi <64 x half> undef to <64 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptosi <64 x half> undef to <64 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptosi <128 x half> undef to <128 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptosi <128 x half> undef to <128 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptosi <128 x half> undef to <128 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptosi <128 x half> undef to <128 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptosi <128 x half> undef to <128 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64f16_nxv64i64 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16_v2i8 = fptosi <2 x half> undef to <2 x i8>
+  %v2f16_v2i16 = fptosi <2 x half> undef to <2 x i16>
+  %v2f16_v2i32 = fptosi <2 x half> undef to <2 x i32>
+  %v2f16_v2i64 = fptosi <2 x half> undef to <2 x i64>
+  %v2f16_v2i1 = fptosi <2 x half> undef to <2 x i1>
+  %v4f16_v4i8 = fptosi <4 x half> undef to <4 x i8>
+  %v4f16_v4i16 = fptosi <4 x half> undef to <4 x i16>
+  %v4f16_v4i32 = fptosi <4 x half> undef to <4 x i32>
+  %v4f16_v4i64 = fptosi <4 x half> undef to <4 x i64>
+  %v4f16_v4i1 = fptosi <4 x half> undef to <4 x i1>
+  %v8f16_v8i8 = fptosi <8 x half> undef to <8 x i8>
+  %v8f16_v8i16 = fptosi <8 x half> undef to <8 x i16>
+  %v8f16_v8i32 = fptosi <8 x half> undef to <8 x i32>
+  %v8f16_v8i64 = fptosi <8 x half> undef to <8 x i64>
+  %v8f16_v8i1 = fptosi <8 x half> undef to <8 x i1>
+  %v16f16_v16i8 = fptosi <16 x half> undef to <16 x i8>
+  %v16f16_v16i16 = fptosi <16 x half> undef to <16 x i16>
+  %v16f16_v16i32 = fptosi <16 x half> undef to <16 x i32>
+  %v16f16_v16i64 = fptosi <16 x half> undef to <16 x i64>
+  %v16f16_v16i1 = fptosi <16 x half> undef to <16 x i1>
+  %v32f16_v32i8 = fptosi <32 x half> undef to <32 x i8>
+  %v32f16_v32i16 = fptosi <32 x half> undef to <32 x i16>
+  %v32f16_v32i32 = fptosi <32 x half> undef to <32 x i32>
+  %v32f16_v32i64 = fptosi <32 x half> undef to <32 x i64>
+  %v32f16_v32i1 = fptosi <32 x half> undef to <32 x i1>
+  %v64f16_v64i8 = fptosi <64 x half> undef to <64 x i8>
+  %v64f16_v64i16 = fptosi <64 x half> undef to <64 x i16>
+  %v64f16_v64i32 = fptosi <64 x half> undef to <64 x i32>
+  %v64f16_v64i64 = fptosi <64 x half> undef to <64 x i64>
+  %v64f16_v64i1 = fptosi <64 x half> undef to <64 x i1>
+  %v128f16_v128i8 = fptosi <128 x half> undef to <128 x i8>
+  %v128f16_v128i16 = fptosi <128 x half> undef to <128 x i16>
+  %v128f16_v128i32 = fptosi <128 x half> undef to <128 x i32>
+  %v128f16_v128i64 = fptosi <128 x half> undef to <128 x i64>
+  %v128f16_v128i1 = fptosi <128 x half> undef to <128 x i1>
+  %nxv1f16_nxv1i8 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i8>
+  %nxv1f16_nxv1i16 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i16>
+  %nxv1f16_nxv1i32 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i32>
+  %nxv1f16_nxv1i64 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i64>
+  %nxv1f16_nxv1i1 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i1>
+  %nxv2f16_nxv2i8 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i8>
+  %nxv2f16_nxv2i16 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i16>
+  %nxv2f16_nxv2i32 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i32>
+  %nxv2f16_nxv2i64 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i64>
+  %nxv2f16_nxv2i1 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i1>
+  %nxv4f16_nxv4i8 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i8>
+  %nxv4f16_nxv4i16 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i16>
+  %nxv4f16_nxv4i32 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i32>
+  %nxv4f16_nxv4i64 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i64>
+  %nxv4f16_nxv4i1 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i1>
+  %nxv8f16_nxv8i8 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i8>
+  %nxv8f16_nxv8i16 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i16>
+  %nxv8f16_nxv8i32 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i32>
+  %nxv8f16_nxv8i64 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i64>
+  %nxv8f16_nxv8i1 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i1>
+  %nxv16f16_nxv16i8 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i8>
+  %nxv16f16_nxv16i16 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i16>
+  %nxv16f16_nxv16i32 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i32>
+  %nxv16f16_nxv16i64 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i64>
+  %nxv16f16_nxv16i1 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i1>
+  %nxv32f16_nxv32i8 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i8>
+  %nxv32f16_nxv32i16 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i16>
+  %nxv32f16_nxv32i32 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i32>
+  %nxv32f16_nxv32i64 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i64>
+  %nxv32f16_nxv32i1 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i1>
+  %nxv64f16_nxv64i8 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i8>
+  %nxv64f16_nxv64i16 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i16>
+  %nxv64f16_nxv64i32 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i32>
+  %nxv64f16_nxv64i64 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i64>
+  %nxv64f16_nxv64i1 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i1>
+  ret void
+}
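For readers skimming these generated checks: each prefix (RV32ZVFH, RV32ZVFHMIN, RV64ZVFH, RV64ZVFHMIN) corresponds to one RUN configuration, and each line records the cost the RISC-V cost model reports for a single conversion instruction. A minimal standalone test of the same shape, given here only as an illustrative sketch (the RUN line, triple, and attribute string are assumptions, not copied from this patch), would be:

    ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output \
    ; RUN:   -mtriple=riscv64 -mattr=+v,+zvfh | FileCheck %s
    define void @sample() {
    ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %c = fptosi <2 x half> undef to <2 x i32>
      %c = fptosi <2 x half> undef to <2 x i32>
      ret void
    }

The expected cost of 1 matches the %v2f16_v2i32 entry in the checks above; the cost-model printer writes to stderr, hence the 2>&1 before the pipe to FileCheck.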
+
+define void @fptoui() {
+; RV32ZVFH-LABEL: 'fptoui'
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptoui <2 x half> undef to <2 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptoui <2 x half> undef to <2 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptoui <2 x half> undef to <2 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptoui <2 x half> undef to <2 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptoui <2 x half> undef to <2 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptoui <4 x half> undef to <4 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptoui <4 x half> undef to <4 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptoui <4 x half> undef to <4 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptoui <4 x half> undef to <4 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptoui <4 x half> undef to <4 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptoui <8 x half> undef to <8 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptoui <8 x half> undef to <8 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptoui <8 x half> undef to <8 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptoui <8 x half> undef to <8 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptoui <8 x half> undef to <8 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptoui <16 x half> undef to <16 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptoui <16 x half> undef to <16 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptoui <16 x half> undef to <16 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptoui <16 x half> undef to <16 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptoui <16 x half> undef to <16 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptoui <32 x half> undef to <32 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptoui <32 x half> undef to <32 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptoui <32 x half> undef to <32 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptoui <32 x half> undef to <32 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptoui <32 x half> undef to <32 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptoui <64 x half> undef to <64 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptoui <64 x half> undef to <64 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptoui <64 x half> undef to <64 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptoui <64 x half> undef to <64 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptoui <64 x half> undef to <64 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptoui <128 x half> undef to <128 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptoui <128 x half> undef to <128 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptoui <128 x half> undef to <128 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptoui <128 x half> undef to <128 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptoui <128 x half> undef to <128 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64f16_nxv64i64 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; RV32ZVFHMIN-LABEL: 'fptoui'
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptoui <2 x half> undef to <2 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptoui <2 x half> undef to <2 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptoui <2 x half> undef to <2 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptoui <2 x half> undef to <2 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptoui <2 x half> undef to <2 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptoui <4 x half> undef to <4 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptoui <4 x half> undef to <4 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptoui <4 x half> undef to <4 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptoui <4 x half> undef to <4 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptoui <4 x half> undef to <4 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptoui <8 x half> undef to <8 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptoui <8 x half> undef to <8 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptoui <8 x half> undef to <8 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptoui <8 x half> undef to <8 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptoui <8 x half> undef to <8 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptoui <16 x half> undef to <16 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptoui <16 x half> undef to <16 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptoui <16 x half> undef to <16 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptoui <16 x half> undef to <16 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptoui <16 x half> undef to <16 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptoui <32 x half> undef to <32 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptoui <32 x half> undef to <32 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptoui <32 x half> undef to <32 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptoui <32 x half> undef to <32 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptoui <32 x half> undef to <32 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptoui <64 x half> undef to <64 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptoui <64 x half> undef to <64 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptoui <64 x half> undef to <64 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptoui <64 x half> undef to <64 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptoui <64 x half> undef to <64 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptoui <128 x half> undef to <128 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptoui <128 x half> undef to <128 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptoui <128 x half> undef to <128 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptoui <128 x half> undef to <128 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptoui <128 x half> undef to <128 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64f16_nxv64i64 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; RV64ZVFH-LABEL: 'fptoui'
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptoui <2 x half> undef to <2 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptoui <2 x half> undef to <2 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptoui <2 x half> undef to <2 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptoui <2 x half> undef to <2 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptoui <2 x half> undef to <2 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptoui <4 x half> undef to <4 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptoui <4 x half> undef to <4 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptoui <4 x half> undef to <4 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptoui <4 x half> undef to <4 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptoui <4 x half> undef to <4 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptoui <8 x half> undef to <8 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptoui <8 x half> undef to <8 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptoui <8 x half> undef to <8 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptoui <8 x half> undef to <8 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptoui <8 x half> undef to <8 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptoui <16 x half> undef to <16 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptoui <16 x half> undef to <16 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptoui <16 x half> undef to <16 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptoui <16 x half> undef to <16 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptoui <16 x half> undef to <16 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptoui <32 x half> undef to <32 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptoui <32 x half> undef to <32 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptoui <32 x half> undef to <32 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptoui <32 x half> undef to <32 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptoui <32 x half> undef to <32 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptoui <64 x half> undef to <64 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptoui <64 x half> undef to <64 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptoui <64 x half> undef to <64 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptoui <64 x half> undef to <64 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptoui <64 x half> undef to <64 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptoui <128 x half> undef to <128 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptoui <128 x half> undef to <128 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptoui <128 x half> undef to <128 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptoui <128 x half> undef to <128 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptoui <128 x half> undef to <128 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64f16_nxv64i64 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; RV64ZVFHMIN-LABEL: 'fptoui'
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptoui <2 x half> undef to <2 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptoui <2 x half> undef to <2 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptoui <2 x half> undef to <2 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptoui <2 x half> undef to <2 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptoui <2 x half> undef to <2 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptoui <4 x half> undef to <4 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptoui <4 x half> undef to <4 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptoui <4 x half> undef to <4 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptoui <4 x half> undef to <4 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptoui <4 x half> undef to <4 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptoui <8 x half> undef to <8 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptoui <8 x half> undef to <8 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptoui <8 x half> undef to <8 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptoui <8 x half> undef to <8 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptoui <8 x half> undef to <8 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptoui <16 x half> undef to <16 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptoui <16 x half> undef to <16 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptoui <16 x half> undef to <16 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptoui <16 x half> undef to <16 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptoui <16 x half> undef to <16 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptoui <32 x half> undef to <32 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptoui <32 x half> undef to <32 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptoui <32 x half> undef to <32 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptoui <32 x half> undef to <32 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptoui <32 x half> undef to <32 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptoui <64 x half> undef to <64 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptoui <64 x half> undef to <64 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptoui <64 x half> undef to <64 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptoui <64 x half> undef to <64 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptoui <64 x half> undef to <64 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptoui <128 x half> undef to <128 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptoui <128 x half> undef to <128 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptoui <128 x half> undef to <128 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptoui <128 x half> undef to <128 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptoui <128 x half> undef to <128 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64f16_nxv64i64 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16_v2i8 = fptoui <2 x half> undef to <2 x i8>
+  %v2f16_v2i16 = fptoui <2 x half> undef to <2 x i16>
+  %v2f16_v2i32 = fptoui <2 x half> undef to <2 x i32>
+  %v2f16_v2i64 = fptoui <2 x half> undef to <2 x i64>
+  %v2f16_v2i1 = fptoui <2 x half> undef to <2 x i1>
+  %v4f16_v4i8 = fptoui <4 x half> undef to <4 x i8>
+  %v4f16_v4i16 = fptoui <4 x half> undef to <4 x i16>
+  %v4f16_v4i32 = fptoui <4 x half> undef to <4 x i32>
+  %v4f16_v4i64 = fptoui <4 x half> undef to <4 x i64>
+  %v4f16_v4i1 = fptoui <4 x half> undef to <4 x i1>
+  %v8f16_v8i8 = fptoui <8 x half> undef to <8 x i8>
+  %v8f16_v8i16 = fptoui <8 x half> undef to <8 x i16>
+  %v8f16_v8i32 = fptoui <8 x half> undef to <8 x i32>
+  %v8f16_v8i64 = fptoui <8 x half> undef to <8 x i64>
+  %v8f16_v8i1 = fptoui <8 x half> undef to <8 x i1>
+  %v16f16_v16i8 = fptoui <16 x half> undef to <16 x i8>
+  %v16f16_v16i16 = fptoui <16 x half> undef to <16 x i16>
+  %v16f16_v16i32 = fptoui <16 x half> undef to <16 x i32>
+  %v16f16_v16i64 = fptoui <16 x half> undef to <16 x i64>
+  %v16f16_v16i1 = fptoui <16 x half> undef to <16 x i1>
+  %v32f16_v32i8 = fptoui <32 x half> undef to <32 x i8>
+  %v32f16_v32i16 = fptoui <32 x half> undef to <32 x i16>
+  %v32f16_v32i32 = fptoui <32 x half> undef to <32 x i32>
+  %v32f16_v32i64 = fptoui <32 x half> undef to <32 x i64>
+  %v32f16_v32i1 = fptoui <32 x half> undef to <32 x i1>
+  %v64f16_v64i8 = fptoui <64 x half> undef to <64 x i8>
+  %v64f16_v64i16 = fptoui <64 x half> undef to <64 x i16>
+  %v64f16_v64i32 = fptoui <64 x half> undef to <64 x i32>
+  %v64f16_v64i64 = fptoui <64 x half> undef to <64 x i64>
+  %v64f16_v64i1 = fptoui <64 x half> undef to <64 x i1>
+  %v128f16_v128i8 = fptoui <128 x half> undef to <128 x i8>
+  %v128f16_v128i16 = fptoui <128 x half> undef to <128 x i16>
+  %v128f16_v128i32 = fptoui <128 x half> undef to <128 x i32>
+  %v128f16_v128i64 = fptoui <128 x half> undef to <128 x i64>
+  %v128f16_v128i1 = fptoui <128 x half> undef to <128 x i1>
+  %nxv1f16_nxv1i8 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i8>
+  %nxv1f16_nxv1i16 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i16>
+  %nxv1f16_nxv1i32 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i32>
+  %nxv1f16_nxv1i64 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i64>
+  %nxv1f16_nxv1i1 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i1>
+  %nxv2f16_nxv2i8 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i8>
+  %nxv2f16_nxv2i16 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i16>
+  %nxv2f16_nxv2i32 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i32>
+  %nxv2f16_nxv2i64 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i64>
+  %nxv2f16_nxv2i1 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i1>
+  %nxv4f16_nxv4i8 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i8>
+  %nxv4f16_nxv4i16 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i16>
+  %nxv4f16_nxv4i32 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i32>
+  %nxv4f16_nxv4i64 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i64>
+  %nxv4f16_nxv4i1 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i1>
+  %nxv8f16_nxv8i8 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i8>
+  %nxv8f16_nxv8i16 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i16>
+  %nxv8f16_nxv8i32 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i32>
+  %nxv8f16_nxv8i64 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i64>
+  %nxv8f16_nxv8i1 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i1>
+  %nxv16f16_nxv16i8 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i8>
+  %nxv16f16_nxv16i16 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i16>
+  %nxv16f16_nxv16i32 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i32>
+  %nxv16f16_nxv16i64 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i64>
+  %nxv16f16_nxv16i1 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i1>
+  %nxv32f16_nxv32i8 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i8>
+  %nxv32f16_nxv32i16 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i16>
+  %nxv32f16_nxv32i32 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i32>
+  %nxv32f16_nxv32i64 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i64>
+  %nxv32f16_nxv32i1 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i1>
+  %nxv64f16_nxv64i8 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i8>
+  %nxv64f16_nxv64i16 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i16>
+  %nxv64f16_nxv64i32 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i32>
+  %nxv64f16_nxv64i64 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i64>
+  %nxv64f16_nxv64i1 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i1>
+  ret void
+}
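The nxv cases exercise the scalable-vector path: a name like %nxv4f16_nxv4i32 denotes a <vscale x 4 x half> source and a <vscale x 4 x i32> result. Under the same assumed RUN configuration as the sketch above, the scalable analogue of a single-instruction conversion would be:

    define void @sample_scalable() {
    ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %c = fptoui <vscale x 4 x half> undef to <vscale x 4 x i32>
      %c = fptoui <vscale x 4 x half> undef to <vscale x 4 x i32>
      ret void
    }

The expected cost of 1 matches the %nxv4f16_nxv4i32 entries above; the larger costs (5, 11, 22 or 23) appear where the result type no longer fits in one register group and the conversion is split.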
+
+define void @sitofp() {
+; RV32ZVFH-LABEL: 'sitofp'
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = sitofp <2 x i8> undef to <2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = sitofp <2 x i16> undef to <2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = sitofp <2 x i32> undef to <2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = sitofp <2 x i64> undef to <2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = sitofp <2 x i1> undef to <2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = sitofp <4 x i8> undef to <4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = sitofp <4 x i16> undef to <4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = sitofp <4 x i32> undef to <4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = sitofp <4 x i64> undef to <4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = sitofp <4 x i1> undef to <4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = sitofp <8 x i8> undef to <8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = sitofp <8 x i16> undef to <8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = sitofp <8 x i32> undef to <8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = sitofp <8 x i64> undef to <8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = sitofp <8 x i1> undef to <8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = sitofp <16 x i8> undef to <16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = sitofp <16 x i16> undef to <16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = sitofp <16 x i32> undef to <16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = sitofp <16 x i64> undef to <16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = sitofp <16 x i1> undef to <16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = sitofp <32 x i8> undef to <32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = sitofp <32 x i16> undef to <32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = sitofp <32 x i32> undef to <32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = sitofp <32 x i64> undef to <32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = sitofp <32 x i1> undef to <32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = sitofp <64 x i8> undef to <64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = sitofp <64 x i16> undef to <64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = sitofp <64 x i32> undef to <64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = sitofp <64 x i64> undef to <64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = sitofp <64 x i1> undef to <64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = sitofp <128 x i8> undef to <128 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = sitofp <128 x i32> undef to <128 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = sitofp <128 x i64> undef to <128 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = sitofp <128 x i1> undef to <128 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i64_nxv64f16 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; RV32ZVFHMIN-LABEL: 'sitofp'
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = sitofp <2 x i8> undef to <2 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = sitofp <2 x i16> undef to <2 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = sitofp <2 x i32> undef to <2 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = sitofp <2 x i64> undef to <2 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = sitofp <2 x i1> undef to <2 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = sitofp <4 x i8> undef to <4 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = sitofp <4 x i16> undef to <4 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = sitofp <4 x i32> undef to <4 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = sitofp <4 x i64> undef to <4 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = sitofp <4 x i1> undef to <4 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = sitofp <8 x i8> undef to <8 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = sitofp <8 x i16> undef to <8 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = sitofp <8 x i32> undef to <8 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = sitofp <8 x i64> undef to <8 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = sitofp <8 x i1> undef to <8 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = sitofp <16 x i8> undef to <16 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = sitofp <16 x i16> undef to <16 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = sitofp <16 x i32> undef to <16 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = sitofp <16 x i64> undef to <16 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = sitofp <16 x i1> undef to <16 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = sitofp <32 x i8> undef to <32 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = sitofp <32 x i16> undef to <32 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = sitofp <32 x i32> undef to <32 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = sitofp <32 x i64> undef to <32 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = sitofp <32 x i1> undef to <32 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = sitofp <64 x i8> undef to <64 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = sitofp <64 x i16> undef to <64 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = sitofp <64 x i32> undef to <64 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = sitofp <64 x i64> undef to <64 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = sitofp <64 x i1> undef to <64 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = sitofp <128 x i8> undef to <128 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = sitofp <128 x i32> undef to <128 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = sitofp <128 x i64> undef to <128 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = sitofp <128 x i1> undef to <128 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = sitofp <vscale x 16 x i1>
undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = sitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = sitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = sitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = sitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = sitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = sitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = sitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = sitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i64_nxv64f16 = sitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = sitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; RV64ZVFH-LABEL: 'sitofp' +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = sitofp <2 x i8> undef to <2 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = sitofp <2 x i16> undef to <2 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = sitofp <2 x i32> undef to <2 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = sitofp <2 x i64> undef to <2 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = sitofp <2 x i1> undef to <2 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = sitofp <4 x i8> undef to <4 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = sitofp <4 x i16> undef to <4 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = sitofp <4 x i32> undef to <4 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = sitofp <4 x i64> undef to <4 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = sitofp <4 x i1> undef to <4 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = sitofp <8 x i8> undef to <8 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = sitofp <8 x i16> undef to <8 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = sitofp <8 x i32> undef to <8 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = sitofp <8 x i64> undef to <8 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = sitofp <8 x i1> undef to <8 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = sitofp <16 x i8> undef to <16 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = sitofp <16 x i16> undef to <16 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = 
sitofp <16 x i32> undef to <16 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = sitofp <16 x i64> undef to <16 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = sitofp <16 x i1> undef to <16 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = sitofp <32 x i8> undef to <32 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = sitofp <32 x i16> undef to <32 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = sitofp <32 x i32> undef to <32 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = sitofp <32 x i64> undef to <32 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = sitofp <32 x i1> undef to <32 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = sitofp <64 x i8> undef to <64 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = sitofp <64 x i16> undef to <64 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = sitofp <64 x i32> undef to <64 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = sitofp <64 x i64> undef to <64 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = sitofp <64 x i1> undef to <64 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = sitofp <128 x i8> undef to <128 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = sitofp <128 x i32> undef to <128 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = sitofp <128 x i64> undef to <128 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = sitofp <128 x i1> undef to <128 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i64_nxv64f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = sitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; RV64ZVFHMIN-LABEL: 'sitofp' +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = sitofp <2 x i8> undef to <2 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = sitofp <2 x i16> undef to <2 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = sitofp <2 x i32> undef to <2 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = sitofp <2 x i64> undef to <2 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = sitofp <2 x i1> undef to <2 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = sitofp <4 x i8> undef to <4 x half> +; 
RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = sitofp <4 x i16> undef to <4 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = sitofp <4 x i32> undef to <4 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = sitofp <4 x i64> undef to <4 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = sitofp <4 x i1> undef to <4 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = sitofp <8 x i8> undef to <8 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = sitofp <8 x i16> undef to <8 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = sitofp <8 x i32> undef to <8 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = sitofp <8 x i64> undef to <8 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = sitofp <8 x i1> undef to <8 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = sitofp <16 x i8> undef to <16 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = sitofp <16 x i16> undef to <16 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = sitofp <16 x i32> undef to <16 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = sitofp <16 x i64> undef to <16 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = sitofp <16 x i1> undef to <16 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = sitofp <32 x i8> undef to <32 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = sitofp <32 x i16> undef to <32 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = sitofp <32 x i32> undef to <32 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = sitofp <32 x i64> undef to <32 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = sitofp <32 x i1> undef to <32 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = sitofp <64 x i8> undef to <64 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = sitofp <64 x i16> undef to <64 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = sitofp <64 x i32> undef to <64 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = sitofp <64 x i64> undef to <64 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = sitofp <64 x i1> undef to <64 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = sitofp <128 x i8> undef to <128 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: 
%v128i32_v128f16 = sitofp <128 x i32> undef to <128 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = sitofp <128 x i64> undef to <128 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = sitofp <128 x i1> undef to <128 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: 
Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i64_nxv64f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = sitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2i8_v2f16 = sitofp <2 x i8> undef to <2 x half> + %v2i16_v2f16 = sitofp <2 x i16> undef to <2 x half> + %v2i32_v2f16 = sitofp <2 x i32> undef to <2 x half> + %v2i64_v2f16 = sitofp <2 x i64> undef to <2 x half> + %v2i1_v2f16 = sitofp <2 x i1> undef to <2 x half> + %v4i8_v4f16 = sitofp <4 x i8> undef to <4 x half> + %v4i16_v4f16 = sitofp <4 x i16> undef to <4 x half> + %v4i32_v4f16 = sitofp <4 x i32> undef to <4 x half> + %v4i64_v4f16 = sitofp <4 x i64> undef to <4 x half> + %v4i1_v4f16 = sitofp <4 x i1> undef to <4 x half> + %v8i8_v8f16 = sitofp <8 x i8> undef to <8 x half> + %v8i16_v8f16 = sitofp <8 x i16> undef to <8 x half> + %v8i32_v8f16 = sitofp <8 x i32> undef to <8 x half> + %v8i64_v8f16 = sitofp <8 x i64> undef to <8 x half> + %v8i1_v8f16 = sitofp <8 x i1> undef to <8 x half> + %v16i8_v16f16 = sitofp <16 x i8> undef to <16 x half> + %v16i16_v16f16 = sitofp <16 x i16> undef to <16 x half> + %v16i32_v16f16 = sitofp <16 x i32> undef to <16 x half> + %v16i64_v16f16 = sitofp <16 x i64> undef to <16 x half> + %v16i1_v16f16 = sitofp <16 x i1> undef to <16 x half> + %v32i8_v32f16 = sitofp <32 x i8> undef to <32 x half> + %v32i16_v32f16 = sitofp <32 x i16> undef to <32 x half> + %v32i32_v32f16 = sitofp <32 x i32> undef to <32 x half> + %v32i64_v32f16 = sitofp <32 x i64> undef to <32 x half> + %v32i1_v32f16 = sitofp <32 x i1> undef to <32 x half> + %v64i8_v64f16 = sitofp <64 x i8> undef to <64 x half> + %v64i16_v64f16 = sitofp <64 x i16> undef to <64 x half> + %v64i32_v64f16 = sitofp <64 x i32> undef to <64 x half> + %v64i64_v64f16 = sitofp <64 x i64> undef to <64 x half> + %v64i1_v64f16 = sitofp <64 x i1> undef to <64 x half> + %v128i8_v128f16 = sitofp <128 x i8> undef to <128 x half> + %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half> + %v128i32_v128f16 = sitofp <128 x i32> undef to <128 x half> + %v128i64_v128f16 = sitofp <128 x i64> undef to <128 x half> + %v128i1_v128f16 = sitofp <128 x i1> undef to <128 x half> + %nxv1i8_nxv1f16 = sitofp undef to + %nxv1i16_nxv1f16 = sitofp undef to + %nxv1i32_nxv1f16 = sitofp undef to + %nxv1i64_nxv1f16 = sitofp undef to + %nxv1i1_nxv1f16 = sitofp undef to + %nxv2i8_nxv2f16 = sitofp undef to + %nxv2i16_nxv2f16 = sitofp undef to + %nxv2i32_nxv2f16 = sitofp undef to + %nxv2i64_nxv2f16 = sitofp undef to + %nxv2i1_nxv2f16 = sitofp undef to + %nxv4i8_nxv4f16 = sitofp undef to + %nxv4i16_nxv4f16 = sitofp undef to + %nxv4i32_nxv4f16 = sitofp undef to + %nxv4i64_nxv4f16 = sitofp undef to + %nxv4i1_nxv4f16 = sitofp undef to + %nxv8i8_nxv8f16 = sitofp undef to + %nxv8i16_nxv8f16 = sitofp undef to + %nxv8i32_nxv8f16 = sitofp undef to + %nxv8i64_nxv8f16 = sitofp undef to + 
%nxv8i1_nxv8f16 = sitofp undef to + %nxv16i8_nxv16f16 = sitofp undef to + %nxv16i16_nxv16f16 = sitofp undef to + %nxv16i32_nxv16f16 = sitofp undef to + %nxv16i64_nxv16f16 = sitofp undef to + %nxv16i1_nxv16f16 = sitofp undef to + %nxv32i8_nxv32f16 = sitofp undef to + %nxv32i16_nxv32f16 = sitofp undef to + %nxv32i32_nxv32f16 = sitofp undef to + %nxv32i64_nxv32f16 = sitofp undef to + %nxv32i1_nxv32f16 = sitofp undef to + %nxv64i8_nxv64f16 = sitofp undef to + %nxv64i16_nxv64f16 = sitofp undef to + %nxv64i32_nxv64f16 = sitofp undef to + %nxv64i64_nxv64f16 = sitofp undef to + %nxv64i1_nxv64f16 = sitofp undef to + ret void +} + +define void @uitofp() { +; RV32ZVFH-LABEL: 'uitofp' +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = uitofp <2 x i8> undef to <2 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = uitofp <2 x i16> undef to <2 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = uitofp <2 x i32> undef to <2 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = uitofp <2 x i64> undef to <2 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = uitofp <2 x i1> undef to <2 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = uitofp <4 x i8> undef to <4 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = uitofp <4 x i16> undef to <4 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = uitofp <4 x i32> undef to <4 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = uitofp <4 x i64> undef to <4 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = uitofp <4 x i1> undef to <4 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = uitofp <8 x i8> undef to <8 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = uitofp <8 x i16> undef to <8 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = uitofp <8 x i32> undef to <8 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = uitofp <8 x i64> undef to <8 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = uitofp <8 x i1> undef to <8 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = uitofp <16 x i8> undef to <16 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = uitofp <16 x i16> undef to <16 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = uitofp <16 x i32> undef to <16 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = uitofp <16 x i64> undef to <16 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = uitofp <16 x i1> undef to <16 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = uitofp <32 x i8> undef to <32 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = uitofp <32 x i16> undef to <32 x half> +; RV32ZVFH-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %v32i32_v32f16 = uitofp <32 x i32> undef to <32 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = uitofp <32 x i64> undef to <32 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = uitofp <32 x i1> undef to <32 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = uitofp <64 x i8> undef to <64 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = uitofp <64 x i16> undef to <64 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = uitofp <64 x i32> undef to <64 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = uitofp <64 x i64> undef to <64 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = uitofp <64 x i1> undef to <64 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = uitofp <128 x i8> undef to <128 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = uitofp <128 x i32> undef to <128 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = uitofp <128 x i64> undef to <128 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = uitofp <128 x i1> undef to <128 x half> +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = 
uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i64_nxv64f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; RV32ZVFHMIN-LABEL: 'uitofp' +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = uitofp <2 x i8> undef to <2 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = uitofp <2 x i16> undef to <2 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = uitofp <2 x i32> undef to <2 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = uitofp <2 x i64> undef to <2 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = uitofp <2 x i1> undef to <2 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = uitofp <4 x i8> undef to <4 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = uitofp <4 x i16> undef to <4 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = uitofp <4 x i32> undef to <4 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = uitofp <4 x i64> undef to <4 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = uitofp <4 x i1> undef to <4 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = 
uitofp <8 x i8> undef to <8 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = uitofp <8 x i16> undef to <8 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = uitofp <8 x i32> undef to <8 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = uitofp <8 x i64> undef to <8 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = uitofp <8 x i1> undef to <8 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = uitofp <16 x i8> undef to <16 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = uitofp <16 x i16> undef to <16 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = uitofp <16 x i32> undef to <16 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = uitofp <16 x i64> undef to <16 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = uitofp <16 x i1> undef to <16 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = uitofp <32 x i8> undef to <32 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = uitofp <32 x i16> undef to <32 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = uitofp <32 x i32> undef to <32 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = uitofp <32 x i64> undef to <32 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = uitofp <32 x i1> undef to <32 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = uitofp <64 x i8> undef to <64 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = uitofp <64 x i16> undef to <64 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = uitofp <64 x i32> undef to <64 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = uitofp <64 x i64> undef to <64 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = uitofp <64 x i1> undef to <64 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = uitofp <128 x i8> undef to <128 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = uitofp <128 x i32> undef to <128 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = uitofp <128 x i64> undef to <128 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = uitofp <128 x i1> undef to <128 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %nxv1i32_nxv1f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 23 for 
instruction: %nxv64i64_nxv64f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; RV64ZVFH-LABEL: 'uitofp' +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = uitofp <2 x i8> undef to <2 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = uitofp <2 x i16> undef to <2 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = uitofp <2 x i32> undef to <2 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = uitofp <2 x i64> undef to <2 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = uitofp <2 x i1> undef to <2 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = uitofp <4 x i8> undef to <4 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = uitofp <4 x i16> undef to <4 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = uitofp <4 x i32> undef to <4 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = uitofp <4 x i64> undef to <4 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = uitofp <4 x i1> undef to <4 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = uitofp <8 x i8> undef to <8 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = uitofp <8 x i16> undef to <8 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = uitofp <8 x i32> undef to <8 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = uitofp <8 x i64> undef to <8 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = uitofp <8 x i1> undef to <8 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = uitofp <16 x i8> undef to <16 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = uitofp <16 x i16> undef to <16 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = uitofp <16 x i32> undef to <16 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = uitofp <16 x i64> undef to <16 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = uitofp <16 x i1> undef to <16 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = uitofp <32 x i8> undef to <32 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = uitofp <32 x i16> undef to <32 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = uitofp <32 x i32> undef to <32 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = uitofp <32 x i64> undef to <32 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = uitofp <32 x i1> undef to <32 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %v64i8_v64f16 = uitofp <64 x i8> undef to <64 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = uitofp <64 x i16> undef to <64 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = uitofp <64 x i32> undef to <64 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = uitofp <64 x i64> undef to <64 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = uitofp <64 x i1> undef to <64 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = uitofp <128 x i8> undef to <128 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = uitofp <128 x i32> undef to <128 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = uitofp <128 x i64> undef to <128 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = uitofp <128 x i1> undef to <128 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: %nxv16i8_nxv16f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i64_nxv64f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; RV64ZVFHMIN-LABEL: 'uitofp' +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = uitofp <2 x i8> undef to <2 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = uitofp <2 x i16> undef to <2 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = uitofp <2 x i32> undef to <2 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = uitofp <2 x i64> undef to <2 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = uitofp <2 x i1> undef to <2 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = uitofp <4 x i8> undef to <4 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = uitofp <4 x i16> undef to <4 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = uitofp <4 x i32> undef to <4 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = uitofp <4 x i64> undef to <4 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = uitofp <4 x i1> undef to <4 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = uitofp <8 x i8> undef to <8 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = uitofp <8 x i16> undef to <8 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = uitofp <8 x i32> undef to <8 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = uitofp <8 x 
i64> undef to <8 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = uitofp <8 x i1> undef to <8 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = uitofp <16 x i8> undef to <16 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = uitofp <16 x i16> undef to <16 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = uitofp <16 x i32> undef to <16 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = uitofp <16 x i64> undef to <16 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = uitofp <16 x i1> undef to <16 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = uitofp <32 x i8> undef to <32 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = uitofp <32 x i16> undef to <32 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = uitofp <32 x i32> undef to <32 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = uitofp <32 x i64> undef to <32 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = uitofp <32 x i1> undef to <32 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = uitofp <64 x i8> undef to <64 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = uitofp <64 x i16> undef to <64 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = uitofp <64 x i32> undef to <64 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = uitofp <64 x i64> undef to <64 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = uitofp <64 x i1> undef to <64 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = uitofp <128 x i8> undef to <128 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = uitofp <128 x i32> undef to <128 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = uitofp <128 x i64> undef to <128 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = uitofp <128 x i1> undef to <128 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = uitofp undef to +; 
RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i64_nxv64f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2i8_v2f16 = uitofp <2 x i8> undef to <2 x half> + %v2i16_v2f16 = uitofp <2 x i16> undef to <2 x half> + %v2i32_v2f16 = uitofp <2 x i32> 
+  %v2i32_v2f16 = uitofp <2 x i32> undef to <2 x half>
+  %v2i64_v2f16 = uitofp <2 x i64> undef to <2 x half>
+  %v2i1_v2f16 = uitofp <2 x i1> undef to <2 x half>
+  %v4i8_v4f16 = uitofp <4 x i8> undef to <4 x half>
+  %v4i16_v4f16 = uitofp <4 x i16> undef to <4 x half>
+  %v4i32_v4f16 = uitofp <4 x i32> undef to <4 x half>
+  %v4i64_v4f16 = uitofp <4 x i64> undef to <4 x half>
+  %v4i1_v4f16 = uitofp <4 x i1> undef to <4 x half>
+  %v8i8_v8f16 = uitofp <8 x i8> undef to <8 x half>
+  %v8i16_v8f16 = uitofp <8 x i16> undef to <8 x half>
+  %v8i32_v8f16 = uitofp <8 x i32> undef to <8 x half>
+  %v8i64_v8f16 = uitofp <8 x i64> undef to <8 x half>
+  %v8i1_v8f16 = uitofp <8 x i1> undef to <8 x half>
+  %v16i8_v16f16 = uitofp <16 x i8> undef to <16 x half>
+  %v16i16_v16f16 = uitofp <16 x i16> undef to <16 x half>
+  %v16i32_v16f16 = uitofp <16 x i32> undef to <16 x half>
+  %v16i64_v16f16 = uitofp <16 x i64> undef to <16 x half>
+  %v16i1_v16f16 = uitofp <16 x i1> undef to <16 x half>
+  %v32i8_v32f16 = uitofp <32 x i8> undef to <32 x half>
+  %v32i16_v32f16 = uitofp <32 x i16> undef to <32 x half>
+  %v32i32_v32f16 = uitofp <32 x i32> undef to <32 x half>
+  %v32i64_v32f16 = uitofp <32 x i64> undef to <32 x half>
+  %v32i1_v32f16 = uitofp <32 x i1> undef to <32 x half>
+  %v64i8_v64f16 = uitofp <64 x i8> undef to <64 x half>
+  %v64i16_v64f16 = uitofp <64 x i16> undef to <64 x half>
+  %v64i32_v64f16 = uitofp <64 x i32> undef to <64 x half>
+  %v64i64_v64f16 = uitofp <64 x i64> undef to <64 x half>
+  %v64i1_v64f16 = uitofp <64 x i1> undef to <64 x half>
+  %v128i8_v128f16 = uitofp <128 x i8> undef to <128 x half>
+  %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half>
+  %v128i32_v128f16 = uitofp <128 x i32> undef to <128 x half>
+  %v128i64_v128f16 = uitofp <128 x i64> undef to <128 x half>
+  %v128i1_v128f16 = uitofp <128 x i1> undef to <128 x half>
+  %nxv1i8_nxv1f16 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x half>
+  %nxv1i16_nxv1f16 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x half>
+  %nxv1i32_nxv1f16 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x half>
+  %nxv1i64_nxv1f16 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x half>
+  %nxv1i1_nxv1f16 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x half>
+  %nxv2i8_nxv2f16 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x half>
+  %nxv2i16_nxv2f16 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x half>
+  %nxv2i32_nxv2f16 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x half>
+  %nxv2i64_nxv2f16 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x half>
+  %nxv2i1_nxv2f16 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x half>
+  %nxv4i8_nxv4f16 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x half>
+  %nxv4i16_nxv4f16 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x half>
+  %nxv4i32_nxv4f16 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x half>
+  %nxv4i64_nxv4f16 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x half>
+  %nxv4i1_nxv4f16 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x half>
+  %nxv8i8_nxv8f16 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x half>
+  %nxv8i16_nxv8f16 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x half>
+  %nxv8i32_nxv8f16 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x half>
+  %nxv8i64_nxv8f16 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x half>
+  %nxv8i1_nxv8f16 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x half>
+  %nxv16i8_nxv16f16 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x half>
+  %nxv16i16_nxv16f16 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x half>
+  %nxv16i32_nxv16f16 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x half>
+  %nxv16i64_nxv16f16 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x half>
+  %nxv16i1_nxv16f16 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x half>
+  %nxv32i8_nxv32f16 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x half>
+  %nxv32i16_nxv32f16 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x half>
+  %nxv32i32_nxv32f16 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x half>
+  %nxv32i64_nxv32f16 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x half>
+  %nxv32i1_nxv32f16 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x half>
+  %nxv64i8_nxv64f16 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
+  %nxv64i16_nxv64f16 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
+  %nxv64i32_nxv64f16 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
+  %nxv64i64_nxv64f16 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
+  %nxv64i1_nxv64f16 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll
index e90fab9fbc8c46..ccc9101e7b0cdd 100644
--- a/llvm/test/Analysis/CostModel/RISCV/cast.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll
@@ -1718,652 +1718,442 @@ define void @fptrunc() {
 define void @fptosi() {
 ; RV32-LABEL: 'fptosi'
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptosi <2 x half> undef to <2 x i8>
instruction: %v2f16_v2i8 = fptosi <2 x half> undef to <2 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptosi <2 x half> undef to <2 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptosi <2 x float> undef to <2 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptosi <2 x double> undef to <2 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptosi <2 x half> undef to <2 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptosi <2 x float> undef to <2 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptosi <2 x double> undef to <2 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptosi <2 x half> undef to <2 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptosi <2 x float> undef to <2 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptosi <2 x half> undef to <2 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptosi <4 x half> undef to <4 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptosi <4 x half> undef to <4 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptosi <4 x double> undef to <4 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptosi <4 x half> undef to <4 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptosi <4 x float> undef to <4 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptosi <4 x double> undef to <4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptosi <4 x half> undef to <4 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i64 = fptosi <4 x float> undef to <4 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptosi <4 x half> undef to <4 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1> ; RV32-NEXT: Cost Model: Found 
an estimated cost of 3 for instruction: %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptosi <8 x half> undef to <8 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptosi <8 x half> undef to <8 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i16 = fptosi <8 x double> undef to <8 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptosi <8 x half> undef to <8 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i32 = fptosi <8 x float> undef to <8 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_v8i32 = fptosi <8 x double> undef to <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptosi <8 x half> undef to <8 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i64 = fptosi <8 x float> undef to <8 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptosi <8 x half> undef to <8 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptosi <16 x half> undef to <16 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptosi <16 x half> undef to <16 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f64_v16i16 = fptosi <16 x double> undef to <16 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptosi <16 x half> undef to <16 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i32 = fptosi <16 x float> undef to <16 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f64_v16i32 = fptosi <16 x double> undef to <16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptosi <16 x half> undef to <16 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i64 = fptosi <16 x float> undef to <16 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%v16f16_v16i1 = fptosi <16 x half> undef to <16 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptosi <32 x half> undef to <32 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptosi <32 x half> undef to <32 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f64_v32i16 = fptosi <32 x double> undef to <32 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptosi <32 x half> undef to <32 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f32_v32i32 = fptosi <32 x float> undef to <32 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f64_v32i32 = fptosi <32 x double> undef to <32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptosi <32 x half> undef to <32 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f32_v32i64 = fptosi <32 x float> undef to <32 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptosi <32 x half> undef to <32 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptosi <64 x half> undef to <64 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptosi <64 x half> undef to <64 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f64_v64i16 = fptosi <64 x double> undef to <64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptosi <64 x half> undef to <64 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptosi <64 x float> undef to <64 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64f64_v64i32 = fptosi <64 x double> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptosi <64 x half> undef to <64 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 
for instruction: %v64f32_v64i64 = fptosi <64 x float> undef to <64 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptosi <64 x half> undef to <64 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptosi <128 x half> undef to <128 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptosi <128 x half> undef to <128 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f64_v128i16 = fptosi <128 x double> undef to <128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptosi <128 x half> undef to <128 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptosi <128 x float> undef to <128 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128f64_v128i32 = fptosi <128 x double> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptosi <128 x half> undef to <128 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128f32_v128i64 = fptosi <128 x float> undef to <128 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptosi <128 x half> undef to <128 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f64_nxv4i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f64_nxv4i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = 
fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f64_nxv8i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_nxv8i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64_nxv8i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64_nxv8i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f64_nxv8i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16f64_nxv16i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f32_nxv16i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f64_nxv16i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f32_nxv16i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f64_nxv16i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f32_nxv16i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f32_nxv16i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16f64_nxv16i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptosi undef to ; RV32-NEXT: 
Cost Model: Found an estimated cost of 5 for instruction: %nxv32f32_nxv32i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32f64_nxv32i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f32_nxv32i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f64_nxv32i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32f64_nxv32i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32f32_nxv32i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32f32_nxv32i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32f64_nxv32i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv64f32_nxv64i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64f64_nxv64i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f32_nxv64i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64f64_nxv64i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64f64_nxv64i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64f16_nxv64i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv64f32_nxv64i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64f64_nxv64i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv64f32_nxv64i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64f64_nxv64i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64-LABEL: 'fptosi' -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptosi <2 x half> undef to <2 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptosi <2 x half> undef to <2 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptosi <2 x float> undef to <2 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptosi <2 x double> undef to <2 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptosi <2 x half> undef to <2 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptosi <2 x float> undef to <2 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptosi <2 x double> undef to <2 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptosi <2 x half> undef to <2 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptosi <2 x float> undef to <2 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptosi <2 x half> undef to <2 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptosi <4 x half> undef to <4 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptosi <4 x half> undef to <4 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptosi <4 x double> undef to <4 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptosi <4 x half> undef to <4 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptosi <4 x float> undef to <4 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptosi <4 x double> undef to <4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptosi <4 x half> undef to <4 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i64 = fptosi <4 x float> undef to <4 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptosi <4 x half> undef to <4 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptosi <8 x half> undef to <8 x i8> ; RV64-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptosi <8 x half> undef to <8 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i16 = fptosi <8 x double> undef to <8 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptosi <8 x half> undef to <8 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i32 = fptosi <8 x float> undef to <8 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_v8i32 = fptosi <8 x double> undef to <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptosi <8 x half> undef to <8 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i64 = fptosi <8 x float> undef to <8 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptosi <8 x half> undef to <8 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptosi <16 x half> undef to <16 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptosi <16 x half> undef to <16 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f64_v16i16 = fptosi <16 x double> undef to <16 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptosi <16 x half> undef to <16 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i32 = fptosi <16 x float> undef to <16 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f64_v16i32 = fptosi <16 x double> undef to <16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptosi <16 x half> undef to <16 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i64 = fptosi <16 x float> undef to <16 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptosi <16 x half> undef to <16 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptosi <32 x half> undef to <32 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptosi <32 x half> undef to <32 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f64_v32i16 = fptosi <32 x double> undef to <32 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptosi <32 x half> undef to <32 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f32_v32i32 = fptosi <32 x float> undef to <32 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f64_v32i32 = fptosi <32 x double> undef to <32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptosi <32 x half> undef to <32 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f32_v32i64 = fptosi <32 x float> undef to <32 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptosi <32 x half> undef to <32 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptosi <64 x half> undef to <64 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptosi <64 x half> undef to <64 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f64_v64i16 = fptosi <64 x double> undef to <64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptosi <64 x half> undef to <64 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptosi <64 x float> undef to <64 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64f64_v64i32 = fptosi <64 x double> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptosi <64 x half> undef to <64 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64f32_v64i64 = fptosi <64 x float> undef to <64 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated 
cost of 3 for instruction: %v64f16_v64i1 = fptosi <64 x half> undef to <64 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptosi <128 x half> undef to <128 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptosi <128 x half> undef to <128 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f64_v128i16 = fptosi <128 x double> undef to <128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptosi <128 x half> undef to <128 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptosi <128 x float> undef to <128 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128f64_v128i32 = fptosi <128 x double> undef to <128 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptosi <128 x half> undef to <128 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128f32_v128i64 = fptosi <128 x float> undef to <128 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptosi <128 x half> undef to <128 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%nxv1f64_nxv1i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i16 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i32 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f64_nxv4i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f64_nxv4i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i1 = fptosi undef to -; RV64-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f64_nxv8i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i16 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_nxv8i16 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i32 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64_nxv8i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64_nxv8i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f64_nxv8i1 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16f64_nxv16i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f32_nxv16i16 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f64_nxv16i16 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f32_nxv16i32 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f64_nxv16i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f32_nxv16i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f32_nxv16i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16f64_nxv16i1 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32f32_nxv32i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32f64_nxv32i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %nxv32f16_nxv32i16 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f32_nxv32i16 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f64_nxv32i16 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32f64_nxv32i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32f32_nxv32i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32f32_nxv32i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32f64_nxv32i1 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv64f32_nxv64i8 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64f64_nxv64i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f32_nxv64i16 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64f64_nxv64i16 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64f64_nxv64i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64f16_nxv64i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64f32_nxv64i64 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv64f32_nxv64i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64f64_nxv64i1 = fptosi undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16_v2i8 = fptosi <2 x half> undef to <2 x i8> %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8> %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8> - %v2f16_v2i16 = fptosi <2 x half> undef to <2 x i16> %v2f32_v2i16 = fptosi <2 x float> undef to <2 x i16> %v2f64_v2i16 = fptosi <2 x double> undef to <2 x i16> - %v2f16_v2i32 = fptosi <2 x half> undef to <2 x i32> %v2f32_v2i32 = fptosi <2 x float> undef to <2 x i32> %v2f64_v2i32 = fptosi <2 x double> undef to <2 x i32> - %v2f16_v2i64 = fptosi <2 x half> undef to <2 x i64> %v2f32_v2i64 = fptosi <2 x float> undef to <2 x i64> 
%v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64> - %v2f16_v2i1 = fptosi <2 x half> undef to <2 x i1> %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1> %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1> - %v4f16_v4i8 = fptosi <4 x half> undef to <4 x i8> %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8> %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8> - %v4f16_v4i16 = fptosi <4 x half> undef to <4 x i16> %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16> %v4f64_v4i16 = fptosi <4 x double> undef to <4 x i16> - %v4f16_v4i32 = fptosi <4 x half> undef to <4 x i32> %v4f32_v4i32 = fptosi <4 x float> undef to <4 x i32> %v4f64_v4i32 = fptosi <4 x double> undef to <4 x i32> - %v4f16_v4i64 = fptosi <4 x half> undef to <4 x i64> %v4f32_v4i64 = fptosi <4 x float> undef to <4 x i64> %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64> - %v4f16_v4i1 = fptosi <4 x half> undef to <4 x i1> %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1> %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1> - %v8f16_v8i8 = fptosi <8 x half> undef to <8 x i8> %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8> %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8> - %v8f16_v8i16 = fptosi <8 x half> undef to <8 x i16> %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16> %v8f64_v8i16 = fptosi <8 x double> undef to <8 x i16> - %v8f16_v8i32 = fptosi <8 x half> undef to <8 x i32> %v8f32_v8i32 = fptosi <8 x float> undef to <8 x i32> %v8f64_v8i32 = fptosi <8 x double> undef to <8 x i32> - %v8f16_v8i64 = fptosi <8 x half> undef to <8 x i64> %v8f32_v8i64 = fptosi <8 x float> undef to <8 x i64> %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64> - %v8f16_v8i1 = fptosi <8 x half> undef to <8 x i1> %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1> %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1> - %v16f16_v16i8 = fptosi <16 x half> undef to <16 x i8> %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8> %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8> - %v16f16_v16i16 = fptosi <16 x half> undef to <16 x i16> %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16> %v16f64_v16i16 = fptosi <16 x double> undef to <16 x i16> - %v16f16_v16i32 = fptosi <16 x half> undef to <16 x i32> %v16f32_v16i32 = fptosi <16 x float> undef to <16 x i32> %v16f64_v16i32 = fptosi <16 x double> undef to <16 x i32> - %v16f16_v16i64 = fptosi <16 x half> undef to <16 x i64> %v16f32_v16i64 = fptosi <16 x float> undef to <16 x i64> %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64> - %v16f16_v16i1 = fptosi <16 x half> undef to <16 x i1> %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1> %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1> - %v32f16_v32i8 = fptosi <32 x half> undef to <32 x i8> %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8> %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8> - %v32f16_v32i16 = fptosi <32 x half> undef to <32 x i16> %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16> %v32f64_v32i16 = fptosi <32 x double> undef to <32 x i16> - %v32f16_v32i32 = fptosi <32 x half> undef to <32 x i32> %v32f32_v32i32 = fptosi <32 x float> undef to <32 x i32> %v32f64_v32i32 = fptosi <32 x double> undef to <32 x i32> - %v32f16_v32i64 = fptosi <32 x half> undef to <32 x i64> %v32f32_v32i64 = fptosi <32 x float> undef to <32 x i64> %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64> - %v32f16_v32i1 = fptosi <32 x half> undef to <32 x i1> %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1> %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1> - %v64f16_v64i8 = fptosi <64 x half> undef to <64 x 
i8> %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8> %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8> - %v64f16_v64i16 = fptosi <64 x half> undef to <64 x i16> %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16> %v64f64_v64i16 = fptosi <64 x double> undef to <64 x i16> - %v64f16_v64i32 = fptosi <64 x half> undef to <64 x i32> %v64f32_v64i32 = fptosi <64 x float> undef to <64 x i32> %v64f64_v64i32 = fptosi <64 x double> undef to <64 x i32> - %v64f16_v64i64 = fptosi <64 x half> undef to <64 x i64> %v64f32_v64i64 = fptosi <64 x float> undef to <64 x i64> %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64> - %v64f16_v64i1 = fptosi <64 x half> undef to <64 x i1> %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1> %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1> - %v128f16_v128i8 = fptosi <128 x half> undef to <128 x i8> %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8> %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8> - %v128f16_v128i16 = fptosi <128 x half> undef to <128 x i16> %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16> %v128f64_v128i16 = fptosi <128 x double> undef to <128 x i16> - %v128f16_v128i32 = fptosi <128 x half> undef to <128 x i32> %v128f32_v128i32 = fptosi <128 x float> undef to <128 x i32> %v128f64_v128i32 = fptosi <128 x double> undef to <128 x i32> - %v128f16_v128i64 = fptosi <128 x half> undef to <128 x i64> %v128f32_v128i64 = fptosi <128 x float> undef to <128 x i64> %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64> - %v128f16_v128i1 = fptosi <128 x half> undef to <128 x i1> %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1> %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1> - %nxv1f16_nxv1i8 = fptosi undef to %nxv1f32_nxv1i8 = fptosi undef to %nxv1f64_nxv1i8 = fptosi undef to - %nxv1f16_nxv1i16 = fptosi undef to %nxv1f32_nxv1i16 = fptosi undef to %nxv1f64_nxv1i16 = fptosi undef to - %nxv1f16_nxv1i32 = fptosi undef to %nxv1f32_nxv1i32 = fptosi undef to %nxv1f64_nxv1i32 = fptosi undef to - %nxv1f16_nxv1i64 = fptosi undef to %nxv1f32_nxv1i64 = fptosi undef to %nxv1f64_nxv1i64 = fptosi undef to - %nxv1f16_nxv1i1 = fptosi undef to %nxv1f32_nxv1i1 = fptosi undef to %nxv1f64_nxv1i1 = fptosi undef to - %nxv2f16_nxv2i8 = fptosi undef to %nxv2f32_nxv2i8 = fptosi undef to %nxv2f64_nxv2i8 = fptosi undef to - %nxv2f16_nxv2i16 = fptosi undef to %nxv2f32_nxv2i16 = fptosi undef to %nxv2f64_nxv2i16 = fptosi undef to - %nxv2f16_nxv2i32 = fptosi undef to %nxv2f32_nxv2i32 = fptosi undef to %nxv2f64_nxv2i32 = fptosi undef to - %nxv2f16_nxv2i64 = fptosi undef to %nxv2f32_nxv2i64 = fptosi undef to %nxv2f64_nxv2i64 = fptosi undef to - %nxv2f16_nxv2i1 = fptosi undef to %nxv2f32_nxv2i1 = fptosi undef to %nxv2f64_nxv2i1 = fptosi undef to - %nxv4f16_nxv4i8 = fptosi undef to %nxv4f32_nxv4i8 = fptosi undef to %nxv4f64_nxv4i8 = fptosi undef to - %nxv4f16_nxv4i16 = fptosi undef to %nxv4f32_nxv4i16 = fptosi undef to %nxv4f64_nxv4i16 = fptosi undef to - %nxv4f16_nxv4i32 = fptosi undef to %nxv4f32_nxv4i32 = fptosi undef to %nxv4f64_nxv4i32 = fptosi undef to - %nxv4f16_nxv4i64 = fptosi undef to %nxv4f32_nxv4i64 = fptosi undef to %nxv4f64_nxv4i64 = fptosi undef to - %nxv4f16_nxv4i1 = fptosi undef to %nxv4f32_nxv4i1 = fptosi undef to %nxv4f64_nxv4i1 = fptosi undef to - %nxv8f16_nxv8i8 = fptosi undef to %nxv8f32_nxv8i8 = fptosi undef to %nxv8f64_nxv8i8 = fptosi undef to - %nxv8f16_nxv8i16 = fptosi undef to %nxv8f32_nxv8i16 = fptosi undef to %nxv8f64_nxv8i16 = fptosi undef to - %nxv8f16_nxv8i32 = fptosi undef to 
  %nxv8f32_nxv8i32 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i32>
  %nxv8f64_nxv8i32 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i32>
-  %nxv8f16_nxv8i64 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i64>
  %nxv8f32_nxv8i64 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i64>
  %nxv8f64_nxv8i64 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i64>
-  %nxv8f16_nxv8i1 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i1>
  %nxv8f32_nxv8i1 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i1>
  %nxv8f64_nxv8i1 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i1>
-  %nxv16f16_nxv16i8 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i8>
  %nxv16f32_nxv16i8 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i8>
  %nxv16f64_nxv16i8 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i8>
-  %nxv16f16_nxv16i16 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i16>
  %nxv16f32_nxv16i16 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i16>
  %nxv16f64_nxv16i16 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i16>
-  %nxv16f16_nxv16i32 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i32>
  %nxv16f32_nxv16i32 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i32>
  %nxv16f64_nxv16i32 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i32>
-  %nxv16f16_nxv16i64 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i64>
  %nxv16f32_nxv16i64 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i64>
  %nxv16f64_nxv16i64 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i64>
-  %nxv16f16_nxv16i1 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i1>
  %nxv16f32_nxv16i1 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i1>
  %nxv16f64_nxv16i1 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i1>
-  %nxv32f16_nxv32i8 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i8>
  %nxv32f32_nxv32i8 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i8>
  %nxv32f64_nxv32i8 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i8>
-  %nxv32f16_nxv32i16 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i16>
  %nxv32f32_nxv32i16 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i16>
  %nxv32f64_nxv32i16 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i16>
-  %nxv32f16_nxv32i32 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i32>
  %nxv32f32_nxv32i32 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i32>
  %nxv32f64_nxv32i32 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i32>
-  %nxv32f16_nxv32i64 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i64>
  %nxv32f32_nxv32i64 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i64>
  %nxv32f64_nxv32i64 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i64>
-  %nxv32f16_nxv32i1 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i1>
  %nxv32f32_nxv32i1 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i1>
  %nxv32f64_nxv32i1 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i1>
-  %nxv64f16_nxv64i8 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i8>
  %nxv64f32_nxv64i8 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i8>
  %nxv64f64_nxv64i8 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i8>
-  %nxv64f16_nxv64i16 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i16>
  %nxv64f32_nxv64i16 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i16>
  %nxv64f64_nxv64i16 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i16>
-  %nxv64f16_nxv64i32 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i32>
  %nxv64f32_nxv64i32 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i32>
  %nxv64f64_nxv64i32 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i32>
-  %nxv64f16_nxv64i64 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i64>
  %nxv64f32_nxv64i64 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i64>
  %nxv64f64_nxv64i64 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i64>
-  %nxv64f16_nxv64i1 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i1>
  %nxv64f32_nxv64i1 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i1>
  %nxv64f64_nxv64i1 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i1>
@@ -2372,652 +2162,442 @@ define void @fptosi() {
define void @fptoui() {
; RV32-LABEL: 'fptoui'
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptoui <2 x half> undef to <2 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptoui <2 x half> undef to <2 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptoui <2 x float> undef to <2 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptoui <2 x double> undef to <2 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptoui <2 x half> undef to <2 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptoui <2 x float> undef to <2 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptoui <2 x double> undef to <2 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptoui <2 x half> undef to <2 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptoui <2 x float> undef to <2 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptoui <2 x half> undef to <2 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptoui <4 x half> undef to <4 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptoui <4 x half> undef to <4 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptoui <4 x double> undef to <4 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptoui <4 x half> undef to <4 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptoui <4 x float> undef to <4 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptoui <4 x double> undef to <4 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptoui <4 x half> undef to <4 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i64 = fptoui <4 x float> undef to <4 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptoui <4 x half> undef to <4 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptoui <8 x half> undef to <8 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptoui <8 x half> undef to <8 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i16 = fptoui <8 x double> undef to <8 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptoui <8 x half> undef to <8 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i32 = fptoui <8 x float> undef to <8 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_v8i32 = fptoui <8 x double> undef to <8 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptoui <8 x half> undef to <8 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i64 = fptoui <8 x float> undef to <8 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptoui <8 x half> undef to <8 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptoui <16 x half> undef to <16 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptoui <16 x half> undef to <16 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f64_v16i16 = fptoui <16 x double> undef to <16 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptoui <16 x half> undef to <16 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i32 = fptoui <16 x float> undef to <16 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f64_v16i32 = fptoui <16 x double> undef to <16 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptoui <16 x half> undef to <16 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i64 = fptoui <16 x float> undef to <16 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptoui <16 x half> undef to <16 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptoui <32 x half> undef to <32 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptoui <32 x half> undef to <32 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f64_v32i16 = fptoui <32 x double> undef to <32 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptoui <32 x half> undef to <32 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f32_v32i32 = fptoui <32 x float> undef to <32 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f64_v32i32 = fptoui <32 x double> undef to <32 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptoui <32 x half> undef to <32 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f32_v32i64 = fptoui <32 x float> undef to <32 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptoui <32 x half> undef to <32 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptoui <64 x half> undef to <64 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptoui <64 x half> undef to <64 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f64_v64i16 = fptoui <64 x double> undef to <64 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptoui <64 x half> undef to <64 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptoui <64 x float> undef to <64 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64f64_v64i32 = fptoui <64 x double> undef to <64 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptoui <64 x half> undef to <64 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64f32_v64i64 = fptoui <64 x float> undef to <64 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptoui <64 x half> undef to <64 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptoui <128 x half> undef to <128 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptoui <128 x half> undef to <128 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f64_v128i16 = fptoui <128 x double> undef to <128 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptoui <128 x half> undef to <128 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptoui <128 x float> undef to <128 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128f64_v128i32 = fptoui <128 x double> undef to <128 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptoui <128 x half> undef to <128 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128f32_v128i64 = fptoui <128 x float> undef to <128 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptoui <128 x half> undef to <128 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i64 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i64 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i8 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i16 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i32 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f64_nxv4i32 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i64 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f64_nxv4i64 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i1 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i8 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f64_nxv8i8 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i16 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_nxv8i16 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i32 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64_nxv8i32 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i64 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64_nxv8i64 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i1 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f64_nxv8i1 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i8 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16f64_nxv16i8 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f32_nxv16i16 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f64_nxv16i16 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f32_nxv16i32 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f64_nxv16i32 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f32_nxv16i64 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f32_nxv16i1 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16f64_nxv16i1 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32f32_nxv32i8 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32f64_nxv32i8 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f32_nxv32i16 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f64_nxv32i16 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32f64_nxv32i32 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32f32_nxv32i64 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32f32_nxv32i1 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32f64_nxv32i1 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv64f32_nxv64i8 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i8>
; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64f64_nxv64i8 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f32_nxv64i16 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i16>
; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64f64_nxv64i16 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i32>
; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64f64_nxv64i32 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64f16_nxv64i64 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv64f32_nxv64i64 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i64>
; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64f64_nxv64i64 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv64f32_nxv64i1 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64f64_nxv64i1 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i1>
; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; RV64-LABEL: 'fptoui'
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptoui <2 x half> undef to <2 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptoui <2 x half> undef to <2 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptoui <2 x float> undef to <2 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptoui <2 x double> undef to <2 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptoui <2 x half> undef to <2 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptoui <2 x float> undef to <2 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptoui <2 x double> undef to <2 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptoui <2 x half> undef to <2 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptoui <2 x float> undef to <2 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptoui <2 x half> undef to <2 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptoui <4 x half> undef to <4 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptoui <4 x half> undef to <4 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptoui <4 x double> undef to <4 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptoui <4 x half> undef to <4 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptoui <4 x float> undef to <4 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptoui <4 x double> undef to <4 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptoui <4 x half> undef to <4 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i64 = fptoui <4 x float> undef to <4 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptoui <4 x half> undef to <4 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptoui <8 x half> undef to <8 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptoui <8 x half> undef to <8 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i16 = fptoui <8 x double> undef to <8 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptoui <8 x half> undef to <8 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i32 = fptoui <8 x float> undef to <8 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_v8i32 = fptoui <8 x double> undef to <8 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptoui <8 x half> undef to <8 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i64 = fptoui <8 x float> undef to <8 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptoui <8 x half> undef to <8 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptoui <16 x half> undef to <16 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptoui <16 x half> undef to <16 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f64_v16i16 = fptoui <16 x double> undef to <16 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptoui <16 x half> undef to <16 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i32 = fptoui <16 x float> undef to <16 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f64_v16i32 = fptoui <16 x double> undef to <16 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptoui <16 x half> undef to <16 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i64 = fptoui <16 x float> undef to <16 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptoui <16 x half> undef to <16 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptoui <32 x half> undef to <32 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptoui <32 x half> undef to <32 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f64_v32i16 = fptoui <32 x double> undef to <32 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptoui <32 x half> undef to <32 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f32_v32i32 = fptoui <32 x float> undef to <32 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f64_v32i32 = fptoui <32 x double> undef to <32 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptoui <32 x half> undef to <32 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f32_v32i64 = fptoui <32 x float> undef to <32 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptoui <32 x half> undef to <32 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptoui <64 x half> undef to <64 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptoui <64 x half> undef to <64 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f64_v64i16 = fptoui <64 x double> undef to <64 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptoui <64 x half> undef to <64 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptoui <64 x float> undef to <64 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64f64_v64i32 = fptoui <64 x double> undef to <64 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptoui <64 x half> undef to <64 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64f32_v64i64 = fptoui <64 x float> undef to <64 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptoui <64 x half> undef to <64 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptoui <128 x half> undef to <128 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptoui <128 x half> undef to <128 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f64_v128i16 = fptoui <128 x double> undef to <128 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptoui <128 x half> undef to <128 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptoui <128 x float> undef to <128 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128f64_v128i32 = fptoui <128 x double> undef to <128 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptoui <128 x half> undef to <128 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128f32_v128i64 = fptoui <128 x float> undef to <128 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptoui <128 x half> undef to <128 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i64 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i64 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i8 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i16 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i32 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f64_nxv4i32 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i64 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f64_nxv4i64 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i1 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i8 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f64_nxv8i8 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i16 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_nxv8i16 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i32 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64_nxv8i32 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i64 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64_nxv8i64 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i1 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f64_nxv8i1 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i8 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16f64_nxv16i8 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f32_nxv16i16 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f64_nxv16i16 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f32_nxv16i32 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f64_nxv16i32 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f32_nxv16i64 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f32_nxv16i1 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16f64_nxv16i1 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32f32_nxv32i8 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32f64_nxv32i8 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f32_nxv32i16 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f64_nxv32i16 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32f64_nxv32i32 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32f32_nxv32i64 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32f32_nxv32i1 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32f64_nxv32i1 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv64f32_nxv64i8 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i8>
; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64f64_nxv64i8 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f32_nxv64i16 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i16>
; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64f64_nxv64i16 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i32>
; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64f64_nxv64i32 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64f16_nxv64i64 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64f32_nxv64i64 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i64>
; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv64f32_nxv64i1 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64f64_nxv64i1 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i1>
; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
-  %v2f16_v2i8 = fptoui <2 x half> undef to <2 x i8>
  %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8>
  %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8>
-  %v2f16_v2i16 = fptoui <2 x half> undef to <2 x i16>
  %v2f32_v2i16 = fptoui <2 x float> undef to <2 x i16>
  %v2f64_v2i16 = fptoui <2 x double> undef to <2 x i16>
-  %v2f16_v2i32 = fptoui <2 x half> undef to <2 x i32>
  %v2f32_v2i32 = fptoui <2 x float> undef to <2 x i32>
  %v2f64_v2i32 = fptoui <2 x double> undef to <2 x i32>
-  %v2f16_v2i64 = fptoui <2 x half> undef to <2 x i64>
  %v2f32_v2i64 = fptoui <2 x float> undef to <2 x i64>
  %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64>
-  %v2f16_v2i1 = fptoui <2 x half> undef to <2 x i1>
  %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1>
  %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1>
-  %v4f16_v4i8 = fptoui <4 x half> undef to <4 x i8>
  %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8>
  %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8>
-  %v4f16_v4i16 = fptoui <4 x half> undef to <4 x i16>
  %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16>
  %v4f64_v4i16 = fptoui <4 x double> undef to <4 x i16>
-  %v4f16_v4i32 = fptoui <4 x half> undef to <4 x i32>
  %v4f32_v4i32 = fptoui <4 x float> undef to <4 x i32>
  %v4f64_v4i32 = fptoui <4 x double> undef to <4 x i32>
-  %v4f16_v4i64 = fptoui <4 x half> undef to <4 x i64>
  %v4f32_v4i64 = fptoui <4 x float> undef to <4 x i64>
  %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64>
-  %v4f16_v4i1 = fptoui <4 x half> undef to <4 x i1>
  %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1>
  %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1>
-  %v8f16_v8i8 = fptoui <8 x half> undef to <8 x i8>
  %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8>
  %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8>
-  %v8f16_v8i16 = fptoui <8 x half> undef to <8 x i16>
  %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16>
  %v8f64_v8i16 = fptoui <8 x double> undef to <8 x i16>
-  %v8f16_v8i32 = fptoui <8 x half> undef to <8 x i32>
  %v8f32_v8i32 = fptoui <8 x float> undef to <8 x i32>
  %v8f64_v8i32 = fptoui <8 x double> undef to <8 x i32>
-  %v8f16_v8i64 = fptoui <8 x half> undef to <8 x i64>
  %v8f32_v8i64 = fptoui <8 x float> undef to <8 x i64>
  %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64>
-  %v8f16_v8i1 = fptoui <8 x half> undef to <8 x i1>
  %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1>
  %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1>
-  %v16f16_v16i8 = fptoui <16 x half> undef to <16 x i8>
  %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8>
  %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8>
-  %v16f16_v16i16 = fptoui <16 x half> undef to <16 x i16>
  %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16>
  %v16f64_v16i16 = fptoui <16 x double> undef to <16 x i16>
-  %v16f16_v16i32 = fptoui <16 x half> undef to <16 x i32>
  %v16f32_v16i32 = fptoui <16 x float> undef to <16 x i32>
  %v16f64_v16i32 = fptoui <16 x double> undef to <16 x i32>
-  %v16f16_v16i64 = fptoui <16 x half> undef to <16 x i64>
  %v16f32_v16i64 = fptoui <16 x float> undef to <16 x i64>
  %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64>
-  %v16f16_v16i1 = fptoui <16 x half> undef to <16 x i1>
  %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1>
  %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1>
-  %v32f16_v32i8 = fptoui <32 x half> undef to <32 x i8>
  %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8>
  %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8>
-  %v32f16_v32i16 = fptoui <32 x half> undef to <32 x i16>
  %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16>
  %v32f64_v32i16 = fptoui <32 x double> undef to <32 x i16>
-  %v32f16_v32i32 = fptoui <32 x half> undef to <32 x i32>
  %v32f32_v32i32 = fptoui <32 x float> undef to <32 x i32>
  %v32f64_v32i32 = fptoui <32 x double> undef to <32 x i32>
-  %v32f16_v32i64 = fptoui <32 x half> undef to <32 x i64>
  %v32f32_v32i64 = fptoui <32 x float> undef to <32 x i64>
  %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64>
-  %v32f16_v32i1 = fptoui <32 x half> undef to <32 x i1>
  %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1>
  %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1>
-  %v64f16_v64i8 = fptoui <64 x half> undef to <64 x i8>
  %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8>
  %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8>
-  %v64f16_v64i16 = fptoui <64 x half> undef to <64 x i16>
  %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16>
  %v64f64_v64i16 = fptoui <64 x double> undef to <64 x i16>
-  %v64f16_v64i32 = fptoui <64 x half> undef to <64 x i32>
  %v64f32_v64i32 = fptoui <64 x float> undef to <64 x i32>
  %v64f64_v64i32 = fptoui <64 x double> undef to <64 x i32>
-  %v64f16_v64i64 = fptoui <64 x half> undef to <64 x i64>
  %v64f32_v64i64 = fptoui <64 x float> undef to <64 x i64>
  %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64>
-  %v64f16_v64i1 = fptoui <64 x half> undef to <64 x i1>
  %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1>
  %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1>
-  %v128f16_v128i8 = fptoui <128 x half> undef to <128 x i8>
  %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8>
  %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8>
-  %v128f16_v128i16 = fptoui <128 x half> undef to <128 x i16>
  %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16>
  %v128f64_v128i16 = fptoui <128 x double> undef to <128 x i16>
-  %v128f16_v128i32 = fptoui <128 x half> undef to <128 x i32>
  %v128f32_v128i32 = fptoui <128 x float> undef to <128 x i32>
  %v128f64_v128i32 = fptoui <128 x double> undef to <128 x i32>
-  %v128f16_v128i64 = fptoui <128 x half> undef to <128 x i64>
  %v128f32_v128i64 = fptoui <128 x float> undef to <128 x i64>
  %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64>
-  %v128f16_v128i1 = fptoui <128 x half> undef to <128 x i1>
  %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1>
  %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1>
-  %nxv1f16_nxv1i8 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i8>
  %nxv1f32_nxv1i8 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i8>
  %nxv1f64_nxv1i8 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i8>
-  %nxv1f16_nxv1i16 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i16>
  %nxv1f32_nxv1i16 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i16>
  %nxv1f64_nxv1i16 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i16>
-  %nxv1f16_nxv1i32 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i32>
  %nxv1f32_nxv1i32 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i32>
  %nxv1f64_nxv1i32 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i32>
-  %nxv1f16_nxv1i64 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i64>
  %nxv1f32_nxv1i64 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i64>
  %nxv1f64_nxv1i64 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i64>
-  %nxv1f16_nxv1i1 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i1>
  %nxv1f32_nxv1i1 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i1>
  %nxv1f64_nxv1i1 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i1>
-  %nxv2f16_nxv2i8 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i8>
  %nxv2f32_nxv2i8 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i8>
  %nxv2f64_nxv2i8 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i8>
-  %nxv2f16_nxv2i16 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i16>
  %nxv2f32_nxv2i16 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i16>
  %nxv2f64_nxv2i16 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i16>
-  %nxv2f16_nxv2i32 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i32>
  %nxv2f32_nxv2i32 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i32>
  %nxv2f64_nxv2i32 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i32>
-  %nxv2f16_nxv2i64 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i64>
  %nxv2f32_nxv2i64 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i64>
  %nxv2f64_nxv2i64 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i64>
-  %nxv2f16_nxv2i1 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i1>
  %nxv2f32_nxv2i1 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i1>
  %nxv2f64_nxv2i1 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i1>
-  %nxv4f16_nxv4i8 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i8>
  %nxv4f32_nxv4i8 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i8>
  %nxv4f64_nxv4i8 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i8>
-  %nxv4f16_nxv4i16 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i16>
  %nxv4f32_nxv4i16 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i16>
  %nxv4f64_nxv4i16 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i16>
-  %nxv4f16_nxv4i32 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i32>
  %nxv4f32_nxv4i32 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i32>
  %nxv4f64_nxv4i32 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i32>
-  %nxv4f16_nxv4i64 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i64>
  %nxv4f32_nxv4i64 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i64>
  %nxv4f64_nxv4i64 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i64>
-  %nxv4f16_nxv4i1 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i1>
  %nxv4f32_nxv4i1 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i1>
  %nxv4f64_nxv4i1 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i1>
-  %nxv8f16_nxv8i8 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i8>
  %nxv8f32_nxv8i8 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i8>
  %nxv8f64_nxv8i8 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i8>
-  %nxv8f16_nxv8i16 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i16>
  %nxv8f32_nxv8i16 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i16>
  %nxv8f64_nxv8i16 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i16>
-  %nxv8f16_nxv8i32 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i32>
  %nxv8f32_nxv8i32 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i32>
  %nxv8f64_nxv8i32 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i32>
-  %nxv8f16_nxv8i64 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i64>
  %nxv8f32_nxv8i64 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i64>
  %nxv8f64_nxv8i64 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i64>
-  %nxv8f16_nxv8i1 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i1>
  %nxv8f32_nxv8i1 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i1>
  %nxv8f64_nxv8i1 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i1>
-  %nxv16f16_nxv16i8 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i8>
  %nxv16f32_nxv16i8 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i8>
  %nxv16f64_nxv16i8 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i8>
-  %nxv16f16_nxv16i16 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i16>
  %nxv16f32_nxv16i16 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i16>
  %nxv16f64_nxv16i16 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i16>
-  %nxv16f16_nxv16i32 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i32>
  %nxv16f32_nxv16i32 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i32>
  %nxv16f64_nxv16i32 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i32>
-  %nxv16f16_nxv16i64 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i64>
  %nxv16f32_nxv16i64 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i64>
  %nxv16f64_nxv16i64 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i64>
-  %nxv16f16_nxv16i1 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i1>
  %nxv16f32_nxv16i1 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i1>
  %nxv16f64_nxv16i1 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i1>
-  %nxv32f16_nxv32i8 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i8>
  %nxv32f32_nxv32i8 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i8>
  %nxv32f64_nxv32i8 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i8>
-  %nxv32f16_nxv32i16 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i16>
  %nxv32f32_nxv32i16 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i16>
  %nxv32f64_nxv32i16 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i16>
-  %nxv32f16_nxv32i32 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i32>
  %nxv32f32_nxv32i32 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i32>
  %nxv32f64_nxv32i32 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i32>
-  %nxv32f16_nxv32i64 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i64>
  %nxv32f32_nxv32i64 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i64>
  %nxv32f64_nxv32i64 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i64>
-  %nxv32f16_nxv32i1 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i1>
  %nxv32f32_nxv32i1 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i1>
  %nxv32f64_nxv32i1 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i1>
-  %nxv64f16_nxv64i8 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i8>
   %nxv64f32_nxv64i8 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i8>
   %nxv64f64_nxv64i8 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i8>
-  %nxv64f16_nxv64i16 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i16>
   %nxv64f32_nxv64i16 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i16>
   %nxv64f64_nxv64i16 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i16>
-  %nxv64f16_nxv64i32 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i32>
   %nxv64f32_nxv64i32 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i32>
   %nxv64f64_nxv64i32 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i32>
-  %nxv64f16_nxv64i64 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i64>
   %nxv64f32_nxv64i64 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i64>
   %nxv64f64_nxv64i64 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i64>
-  %nxv64f16_nxv64i1 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i1>
   %nxv64f32_nxv64i1 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i1>
   %nxv64f64_nxv64i1 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i1>
@@ -3026,652 +2606,442 @@ define void @fptoui() {
 define void @sitofp() {
 ; RV32-LABEL: 'sitofp'
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = sitofp <2 x i8> undef to <2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = sitofp <2 x i16> undef to <2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = sitofp <2 x i32> undef to <2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = sitofp <2 x i64> undef to <2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f64 = sitofp <2 x i16> undef to <2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = sitofp <2 x i1> undef to <2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = sitofp <4 x i8> undef to <4 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = sitofp <4 x i16> undef to <4 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = sitofp <4 x i32> undef to <4 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = sitofp <4 x i64> undef to <4 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i16> undef to <4 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = sitofp <4 x i1> undef to <4 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = sitofp <8 x i8> undef to <8 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = sitofp <8 x i16> undef to <8 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = sitofp <8 x i32> undef to <8 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = sitofp <8 x i64> undef to <8 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f64 = sitofp <8 x i16> undef to <8 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = sitofp <8 x i1> undef to <8 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = sitofp <16 x i8> undef to <16 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = sitofp <16 x i16> undef to <16 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16f64 = sitofp <16 x i16> undef to <16 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = sitofp <16 x i32> undef to <16 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = sitofp <16 x i64> undef to <16 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f64 = sitofp <16 x i16> undef to <16 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = sitofp <16 x i1> undef to <16 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = sitofp <32 x i8> undef to <32 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = sitofp <32 x i16> undef to <32 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i16_v32f64 = sitofp <32 x i16> undef to <32 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = sitofp <32 x i32> undef to <32 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = sitofp <32 x i64> undef to <32 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f64 = sitofp <32 x i16> undef to <32 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = sitofp <32 x i1> undef to <32 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = sitofp <64 x i8> undef to <64 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = sitofp <64 x i16> undef to <64 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i16_v64f64 = sitofp <64 x i16> undef to <64 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = sitofp <64 x i32> undef to <64 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = sitofp <64 x i64> undef to <64 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f64 = sitofp <64 x i16> undef to <64 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = sitofp <64 x i1> undef to <64 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = sitofp <128 x i8> undef to <128 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i16_v128f64 = sitofp <128 x i16> undef to <128 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = sitofp <128 x i32> undef to <128 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = sitofp <128 x i64> undef to <128 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f64 = sitofp <128 x i16> undef to <128 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = sitofp <128 x i1> undef to <128 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f64 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f64 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2f64 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f64 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f64 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4f32 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4f64 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f32 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f64 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f32 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f64 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64_nxv4f32 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f64 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f32 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f64 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8f32 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8f64 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f32 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_nxv8f64 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f32 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f64 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64_nxv8f32 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f64 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f32 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f64 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_nxv16f32 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i8_nxv16f64 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f32 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i16_nxv16f64 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f32 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_nxv16f64 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i64_nxv16f32 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f64 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f32 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_nxv16f64 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32i8_nxv32f32 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i8_nxv32f64 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_nxv32f32 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i16_nxv32f64 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i32_nxv32f64 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i64_nxv32f32 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f64 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32i1_nxv32f32 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32i1_nxv32f64 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv64i8_nxv64f32 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i8_nxv64f64 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i16_nxv64f32 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i16_nxv64f64 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64i32_nxv64f64 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i64_nxv64f16 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv64i64_nxv64f32 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i64_nxv64f64 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv64i1_nxv64f32 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64i1_nxv64f64 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; RV64-LABEL: 'sitofp'
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = sitofp <2 x i8> undef to <2 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = sitofp <2 x i16> undef to <2 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = sitofp <2 x i32> undef to <2 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = sitofp <2 x i64> undef to <2 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f64 = sitofp <2 x i16> undef to <2 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = sitofp <2 x i1> undef to <2 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = sitofp <4 x i8> undef to <4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = sitofp <4 x i16> undef to <4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = sitofp <4 x i32> undef to <4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = sitofp <4 x i64> undef to <4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i16> undef to <4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = sitofp <4 x i1> undef to <4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = sitofp <8 x i8> undef to <8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = sitofp <8 x i16> undef to <8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = sitofp <8 x i32> undef to <8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = sitofp <8 x i64> undef to <8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f64 = sitofp <8 x i16> undef to <8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = sitofp <8 x i1> undef to <8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = sitofp <16 x i8> undef to <16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = sitofp <16 x i16> undef to <16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16f64 = sitofp <16 x i16> undef to <16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = sitofp <16 x i32> undef to <16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = sitofp <16 x i64> undef to <16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f64 = sitofp <16 x i16> undef to <16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = sitofp <16 x i1> undef to <16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = sitofp <32 x i8> undef to <32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = sitofp <32 x i16> undef to <32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i16_v32f64 = sitofp <32 x i16> undef to <32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = sitofp <32 x i32> undef to <32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = sitofp <32 x i64> undef to <32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f64 = sitofp <32 x i16> undef to <32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = sitofp <32 x i1> undef to <32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = sitofp <64 x i8> undef to <64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = sitofp <64 x i16> undef to <64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i16_v64f64 = sitofp <64 x i16> undef to <64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = sitofp <64 x i32> undef to <64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = sitofp <64 x i64> undef to <64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f64 = sitofp <64 x i16> undef to <64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = sitofp <64 x i1> undef to <64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = sitofp <128 x i8> undef to <128 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i16_v128f64 = sitofp <128 x i16> undef to <128 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = sitofp <128 x i32> undef to <128 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = sitofp <128 x i64> undef to <128 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f64 = sitofp <128 x i16> undef to <128 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = sitofp <128 x i1> undef to <128 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f64 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f64 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2f64 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f64 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f64 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4f32 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4f64 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f32 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f64 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f32 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f64 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64_nxv4f32 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f64 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f32 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f64 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8f32 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8f64 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f32 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_nxv8f64 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f32 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f64 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64_nxv8f32 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f64 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f32 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f64 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_nxv16f32 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i8_nxv16f64 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f32 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i16_nxv16f64 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f32 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_nxv16f64 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i64_nxv16f32 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f64 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f32 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_nxv16f64 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32i8_nxv32f32 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i8_nxv32f64 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_nxv32f32 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i16_nxv32f64 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i32_nxv32f64 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i64_nxv32f32 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f64 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32i1_nxv32f32 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32i1_nxv32f64 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv64i8_nxv64f32 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i8_nxv64f64 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i16_nxv64f32 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i16_nxv64f64 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64i32_nxv64f64 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i64_nxv64f16 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64i64_nxv64f32 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i64_nxv64f64 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv64i1_nxv64f32 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64i1_nxv64f64 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2i8_v2f16 = sitofp <2 x i8> undef to <2 x half>
   %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float>
   %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
-  %v2i16_v2f16 = sitofp <2 x i16> undef to <2 x half>
   %v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float>
   %v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
-  %v2i32_v2f16 = sitofp <2 x i32> undef to <2 x half>
   %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
   %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
-  %v2i64_v2f16 = sitofp <2 x i64> undef to <2 x half>
   %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
   %v2i64_v2f64 = sitofp <2 x i16> undef to <2 x double>
-  %v2i1_v2f16 = sitofp <2 x i1> undef to <2 x half>
   %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float>
   %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double>
-  %v4i8_v4f16 = sitofp <4 x i8> undef to <4 x half>
   %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
   %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
-  %v4i16_v4f16 = sitofp <4 x i16> undef to <4 x half>
   %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
   %v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
-  %v4i32_v4f16 = sitofp <4 x i32> undef to <4 x half>
   %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
   %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
-  %v4i64_v4f16 = sitofp <4 x i64> undef to <4 x half>
   %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
   %v4i64_v4f64 = sitofp <4 x i16> undef to <4 x double>
-  %v4i1_v4f16 = sitofp <4 x i1> undef to <4 x half>
   %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float>
   %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double>
-  %v8i8_v8f16 = sitofp <8 x i8> undef to <8 x half>
   %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
   %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
-  %v8i16_v8f16 = sitofp <8 x i16> undef to <8 x half>
   %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
   %v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
-  %v8i32_v8f16 = sitofp <8 x i32> undef to <8 x half>
   %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
   %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
-  %v8i64_v8f16 = sitofp <8 x i64> undef to <8 x half>
   %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
   %v8i64_v8f64 = sitofp <8 x i16> undef to <8 x double>
-  %v8i1_v8f16 = sitofp <8 x i1> undef to <8 x half>
   %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float>
   %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double>
-  %v16i8_v16f16 = sitofp <16 x i8> undef to <16 x half>
   %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
   %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double>
-  %v16i16_v16f16 = sitofp <16 x i16> undef to <16 x half>
   %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
   %v16i16_v16f64 = sitofp <16 x i16> undef to <16 x double>
-  %v16i32_v16f16 = sitofp <16 x i32> undef to <16 x half>
   %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
   %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double>
-  %v16i64_v16f16 = sitofp <16 x i64> undef to <16 x half>
   %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
   %v16i64_v16f64 = sitofp <16 x i16> undef to <16 x double>
-  %v16i1_v16f16 = sitofp <16 x i1> undef to <16 x half>
   %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float>
   %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double>
-  %v32i8_v32f16 = sitofp <32 x i8> undef to <32 x half>
   %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float>
   %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double>
-  %v32i16_v32f16 = sitofp <32 x i16> undef to <32 x half>
   %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float>
   %v32i16_v32f64 = sitofp <32 x i16> undef to <32 x double>
-  %v32i32_v32f16 = sitofp <32 x i32> undef to <32 x half>
   %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float>
   %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double>
-  %v32i64_v32f16 = sitofp <32 x i64> undef to <32 x half>
   %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float>
   %v32i64_v32f64 = sitofp <32 x i16> undef to <32 x double>
-  %v32i1_v32f16 = sitofp <32 x i1> undef to <32 x half>
   %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float>
   %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double>
-  %v64i8_v64f16 = sitofp <64 x i8> undef to <64 x half>
   %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float>
   %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double>
-  %v64i16_v64f16 = sitofp <64 x i16> undef to <64 x half>
   %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float>
   %v64i16_v64f64 = sitofp <64 x i16> undef to <64 x double>
-  %v64i32_v64f16 = sitofp <64 x i32> undef to <64 x half>
   %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float>
   %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double>
-  %v64i64_v64f16 = sitofp <64 x i64> undef to <64 x half>
   %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float>
   %v64i64_v64f64 = sitofp <64 x i16> undef to <64 x double>
-  %v64i1_v64f16 = sitofp <64 x i1> undef to <64 x half>
   %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float>
   %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double>
-  %v128i8_v128f16 = sitofp <128 x i8> undef to <128 x half>
   %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float>
   %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double>
-  %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half>
   %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float>
   %v128i16_v128f64 = sitofp <128 x i16> undef to <128 x double>
-  %v128i32_v128f16 = sitofp <128 x i32> undef to <128 x half>
   %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float>
   %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double>
-  %v128i64_v128f16 = sitofp <128 x i64> undef to <128 x half>
   %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float>
   %v128i64_v128f64 = sitofp <128 x i16> undef to <128 x double>
-  %v128i1_v128f16 = sitofp <128 x i1> undef to <128 x half>
   %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float>
   %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double>
-  %nxv1i8_nxv1f16 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x half>
   %nxv1i8_nxv1f32 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
   %nxv1i8_nxv1f64 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
-  %nxv1i16_nxv1f16 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x half>
   %nxv1i16_nxv1f32 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
   %nxv1i16_nxv1f64 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
-  %nxv1i32_nxv1f16 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x half>
   %nxv1i32_nxv1f32 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
   %nxv1i32_nxv1f64 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
-  %nxv1i64_nxv1f16 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x half>
   %nxv1i64_nxv1f32 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
   %nxv1i64_nxv1f64 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
-  %nxv1i1_nxv1f16 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x half>
   %nxv1i1_nxv1f32 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
   %nxv1i1_nxv1f64 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
-  %nxv2i8_nxv2f16 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x half>
   %nxv2i8_nxv2f32 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
   %nxv2i8_nxv2f64 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
-  %nxv2i16_nxv2f16 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x half>
   %nxv2i16_nxv2f32 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
   %nxv2i16_nxv2f64 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
-  %nxv2i32_nxv2f16 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x half>
   %nxv2i32_nxv2f32 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
   %nxv2i32_nxv2f64 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
-  %nxv2i64_nxv2f16 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x half>
   %nxv2i64_nxv2f32 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
   %nxv2i64_nxv2f64 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
-  %nxv2i1_nxv2f16 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x half>
   %nxv2i1_nxv2f32 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
   %nxv2i1_nxv2f64 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
-  %nxv4i8_nxv4f16 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x half>
   %nxv4i8_nxv4f32 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
   %nxv4i8_nxv4f64 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
-  %nxv4i16_nxv4f16 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x half>
   %nxv4i16_nxv4f32 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
   %nxv4i16_nxv4f64 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
-  %nxv4i32_nxv4f16 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x half>
   %nxv4i32_nxv4f32 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
   %nxv4i32_nxv4f64 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
-  %nxv4i64_nxv4f16 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x half>
   %nxv4i64_nxv4f32 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
   %nxv4i64_nxv4f64 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
-  %nxv4i1_nxv4f16 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x half>
   %nxv4i1_nxv4f32 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
   %nxv4i1_nxv4f64 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
-  %nxv8i8_nxv8f16 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x half>
   %nxv8i8_nxv8f32 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
   %nxv8i8_nxv8f64 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
-  %nxv8i16_nxv8f16 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x half>
   %nxv8i16_nxv8f32 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
   %nxv8i16_nxv8f64 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
-  %nxv8i32_nxv8f16 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x half>
   %nxv8i32_nxv8f32 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
   %nxv8i32_nxv8f64 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
-  %nxv8i64_nxv8f16 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x half>
   %nxv8i64_nxv8f32 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
   %nxv8i64_nxv8f64 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
-  %nxv8i1_nxv8f16 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x half>
   %nxv8i1_nxv8f32 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
   %nxv8i1_nxv8f64 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
-  %nxv16i8_nxv16f16 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x half>
   %nxv16i8_nxv16f32 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
   %nxv16i8_nxv16f64 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
-  %nxv16i16_nxv16f16 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x half>
   %nxv16i16_nxv16f32 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
   %nxv16i16_nxv16f64 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
-  %nxv16i32_nxv16f16 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x half>
   %nxv16i32_nxv16f32 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
   %nxv16i32_nxv16f64 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
-  %nxv16i64_nxv16f16 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x half>
   %nxv16i64_nxv16f32 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
   %nxv16i64_nxv16f64 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
-  %nxv16i1_nxv16f16 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x half>
   %nxv16i1_nxv16f32 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
   %nxv16i1_nxv16f64 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
-  %nxv32i8_nxv32f16 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x half>
   %nxv32i8_nxv32f32 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
   %nxv32i8_nxv32f64 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
-  %nxv32i16_nxv32f16 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x half>
   %nxv32i16_nxv32f32 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
   %nxv32i16_nxv32f64 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
-  %nxv32i32_nxv32f16 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x half>
   %nxv32i32_nxv32f32 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
   %nxv32i32_nxv32f64 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
-  %nxv32i64_nxv32f16 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x half>
   %nxv32i64_nxv32f32 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
   %nxv32i64_nxv32f64 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
-  %nxv32i1_nxv32f16 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x half>
   %nxv32i1_nxv32f32 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
   %nxv32i1_nxv32f64 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
-  %nxv64i8_nxv64f16 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
   %nxv64i8_nxv64f32 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
   %nxv64i8_nxv64f64 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
-  %nxv64i16_nxv64f16 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
   %nxv64i16_nxv64f32 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
   %nxv64i16_nxv64f64 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
-  %nxv64i32_nxv64f16 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
   %nxv64i32_nxv64f32 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
   %nxv64i32_nxv64f64 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
-  %nxv64i64_nxv64f16 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
   %nxv64i64_nxv64f32 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
   %nxv64i64_nxv64f64 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
-  %nxv64i1_nxv64f16 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
   %nxv64i1_nxv64f32 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
   %nxv64i1_nxv64f64 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
@@ -3680,652 +3050,442 @@ define void @sitofp() {
 define void @uitofp() {
 ; RV32-LABEL: 'uitofp'
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = uitofp <2 x i8> undef to <2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = uitofp <2 x i16> undef to <2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = uitofp <2 x i32> undef to <2 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = uitofp <2 x i64> undef to <2 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f64 = uitofp <2 x i16> undef to <2 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = uitofp <2 x i1> undef to <2 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = uitofp <4 x i8> undef to <4 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = uitofp <4 x i16> undef to <4 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = uitofp <4 x i32> undef to <4 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = uitofp <4 x i64> undef to <4 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i16> undef to <4 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = uitofp <4 x i1> undef to <4 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = uitofp <8 x i8> undef to <8 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = uitofp <8 x i16> undef to <8 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x 
float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = uitofp <8 x i32> undef to <8 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = uitofp <8 x i64> undef to <8 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f64 = uitofp <8 x i16> undef to <8 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = uitofp <8 x i1> undef to <8 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = uitofp <16 x i8> undef to <16 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = uitofp <16 x i16> undef to <16 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16f64 = uitofp <16 x i16> undef to <16 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = uitofp <16 x i32> undef to <16 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = uitofp <16 x i64> undef to <16 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f64 = uitofp <16 x i16> undef to <16 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = uitofp <16 x i1> undef to <16 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = uitofp <32 x i8> undef to <32 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double> -; RV32-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = uitofp <32 x i16> undef to <32 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i16_v32f64 = uitofp <32 x i16> undef to <32 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = uitofp <32 x i32> undef to <32 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = uitofp <32 x i64> undef to <32 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f64 = uitofp <32 x i16> undef to <32 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = uitofp <32 x i1> undef to <32 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = uitofp <64 x i8> undef to <64 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = uitofp <64 x i16> undef to <64 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i16_v64f64 = uitofp <64 x i16> undef to <64 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = uitofp <64 x i32> undef to <64 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = uitofp <64 x i64> undef to <64 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f64 = uitofp <64 x i16> undef to <64 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = uitofp <64 x i1> undef to <64 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = uitofp <128 x i8> undef to 
<128 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i16_v128f64 = uitofp <128 x i16> undef to <128 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = uitofp <128 x i32> undef to <128 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = uitofp <128 x i64> undef to <128 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f64 = uitofp <128 x i16> undef to <128 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = uitofp <128 x i1> undef to <128 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = uitofp 
undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64_nxv4f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %nxv8i16_nxv8f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_nxv8f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64_nxv8f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_nxv16f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i8_nxv16f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i16_nxv16f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_nxv16f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i64_nxv16f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_nxv16f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32i8_nxv32f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i8_nxv32f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_nxv32f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i16_nxv32f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %nxv32i32_nxv32f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i32_nxv32f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i64_nxv32f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32i1_nxv32f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32i1_nxv32f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv64i8_nxv64f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i8_nxv64f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i16_nxv64f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i16_nxv64f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64i32_nxv64f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i64_nxv64f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv64i64_nxv64f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i64_nxv64f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv64i1_nxv64f32 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64i1_nxv64f64 = uitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64-LABEL: 'uitofp' -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = uitofp <2 x i8> undef to <2 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = uitofp <2 x i16> undef to <2 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = uitofp <2 x i32> undef to <2 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float> ; RV64-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = uitofp <2 x i64> undef to <2 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f64 = uitofp <2 x i16> undef to <2 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = uitofp <2 x i1> undef to <2 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = uitofp <4 x i8> undef to <4 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = uitofp <4 x i16> undef to <4 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = uitofp <4 x i32> undef to <4 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = uitofp <4 x i64> undef to <4 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i16> undef to <4 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = uitofp <4 x i1> undef to <4 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = uitofp <8 x i8> undef to <8 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = uitofp <8 x i16> undef to <8 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = uitofp <8 x i32> undef to <8 x 
half> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = uitofp <8 x i64> undef to <8 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f64 = uitofp <8 x i16> undef to <8 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = uitofp <8 x i1> undef to <8 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = uitofp <16 x i8> undef to <16 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = uitofp <16 x i16> undef to <16 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16f64 = uitofp <16 x i16> undef to <16 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = uitofp <16 x i32> undef to <16 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = uitofp <16 x i64> undef to <16 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f64 = uitofp <16 x i16> undef to <16 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = uitofp <16 x i1> undef to <16 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = uitofp <32 x i8> undef to <32 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = uitofp <32 x i16> undef to <32 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float> ; 
RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i16_v32f64 = uitofp <32 x i16> undef to <32 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = uitofp <32 x i32> undef to <32 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = uitofp <32 x i64> undef to <32 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f64 = uitofp <32 x i16> undef to <32 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = uitofp <32 x i1> undef to <32 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = uitofp <64 x i8> undef to <64 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = uitofp <64 x i16> undef to <64 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i16_v64f64 = uitofp <64 x i16> undef to <64 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = uitofp <64 x i32> undef to <64 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = uitofp <64 x i64> undef to <64 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f64 = uitofp <64 x i16> undef to <64 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = uitofp <64 x i1> undef to <64 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = uitofp <128 x i8> undef to <128 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v128i8_v128f64 = uitofp 
<128 x i8> undef to <128 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i16_v128f64 = uitofp <128 x i16> undef to <128 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = uitofp <128 x i32> undef to <128 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = uitofp <128 x i64> undef to <128 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f64 = uitofp <128 x i16> undef to <128 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = uitofp <128 x i1> undef to <128 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f64 = uitofp undef to -; RV64-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64_nxv4f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_nxv8f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%nxv8i32_nxv8f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64_nxv8f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_nxv16f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i8_nxv16f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i16_nxv16f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_nxv16f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i64_nxv16f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_nxv16f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32i8_nxv32f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i8_nxv32f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_nxv32f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i16_nxv32f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i32_nxv32f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: 
%nxv32i64_nxv32f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i64_nxv32f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32i1_nxv32f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32i1_nxv32f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv64i8_nxv64f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i8_nxv64f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i16_nxv64f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i16_nxv64f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64i32_nxv64f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i64_nxv64f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64i64_nxv64f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i64_nxv64f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv64i1_nxv64f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64i1_nxv64f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2i8_v2f16 = uitofp <2 x i8> undef to <2 x half> %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float> %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> - %v2i16_v2f16 = uitofp <2 x i16> undef to <2 x half> %v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float> %v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> - %v2i32_v2f16 = uitofp <2 x i32> undef to <2 x half> %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float> %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> - %v2i64_v2f16 = uitofp <2 x i64> undef to <2 x half> %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> %v2i64_v2f64 = uitofp <2 x i16> undef to <2 x double> - %v2i1_v2f16 = uitofp <2 x i1> undef to <2 x half> %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> - %v4i8_v4f16 = uitofp <4 x i8> undef to <4 x half> %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> - %v4i16_v4f16 = uitofp <4 x i16> undef to <4 x half> %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> %v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> - %v4i32_v4f16 = uitofp <4 x i32> undef to <4 x half> %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float> %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x 
double> - %v4i64_v4f16 = uitofp <4 x i64> undef to <4 x half> %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> %v4i64_v4f64 = uitofp <4 x i16> undef to <4 x double> - %v4i1_v4f16 = uitofp <4 x i1> undef to <4 x half> %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> - %v8i8_v8f16 = uitofp <8 x i8> undef to <8 x half> %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> - %v8i16_v8f16 = uitofp <8 x i16> undef to <8 x half> %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> %v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> - %v8i32_v8f16 = uitofp <8 x i32> undef to <8 x half> %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float> %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double> - %v8i64_v8f16 = uitofp <8 x i64> undef to <8 x half> %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> %v8i64_v8f64 = uitofp <8 x i16> undef to <8 x double> - %v8i1_v8f16 = uitofp <8 x i1> undef to <8 x half> %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float> %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> - %v16i8_v16f16 = uitofp <16 x i8> undef to <16 x half> %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double> - %v16i16_v16f16 = uitofp <16 x i16> undef to <16 x half> %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> %v16i16_v16f64 = uitofp <16 x i16> undef to <16 x double> - %v16i32_v16f16 = uitofp <16 x i32> undef to <16 x half> %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float> %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double> - %v16i64_v16f16 = uitofp <16 x i64> undef to <16 x half> %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> %v16i64_v16f64 = uitofp <16 x i16> undef to <16 x double> - %v16i1_v16f16 = uitofp <16 x i1> undef to <16 x half> %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> - %v32i8_v32f16 = uitofp <32 x i8> undef to <32 x half> %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float> %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double> - %v32i16_v32f16 = uitofp <32 x i16> undef to <32 x half> %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float> %v32i16_v32f64 = uitofp <32 x i16> undef to <32 x double> - %v32i32_v32f16 = uitofp <32 x i32> undef to <32 x half> %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float> %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double> - %v32i64_v32f16 = uitofp <32 x i64> undef to <32 x half> %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float> %v32i64_v32f64 = uitofp <32 x i16> undef to <32 x double> - %v32i1_v32f16 = uitofp <32 x i1> undef to <32 x half> %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> - %v64i8_v64f16 = uitofp <64 x i8> undef to <64 x half> %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float> %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double> - %v64i16_v64f16 = uitofp <64 x i16> undef to <64 x half> %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float> %v64i16_v64f64 = uitofp <64 x i16> undef to <64 x double> - %v64i32_v64f16 = uitofp <64 x i32> undef to <64 x half> %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float> %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double> - %v64i64_v64f16 = uitofp <64 x i64> undef to <64 x half> %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float> %v64i64_v64f64 = uitofp <64 x i16> undef to <64 x double> - %v64i1_v64f16 = uitofp <64 x i1> undef to 
<64 x half> %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> - %v128i8_v128f16 = uitofp <128 x i8> undef to <128 x half> %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float> %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double> - %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half> %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float> %v128i16_v128f64 = uitofp <128 x i16> undef to <128 x double> - %v128i32_v128f16 = uitofp <128 x i32> undef to <128 x half> %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float> %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double> - %v128i64_v128f16 = uitofp <128 x i64> undef to <128 x half> %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float> %v128i64_v128f64 = uitofp <128 x i16> undef to <128 x double> - %v128i1_v128f16 = uitofp <128 x i1> undef to <128 x half> %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double> - %nxv1i8_nxv1f16 = uitofp undef to %nxv1i8_nxv1f32 = uitofp undef to %nxv1i8_nxv1f64 = uitofp undef to - %nxv1i16_nxv1f16 = uitofp undef to %nxv1i16_nxv1f32 = uitofp undef to %nxv1i16_nxv1f64 = uitofp undef to - %nxv1i32_nxv1f16 = uitofp undef to %nxv1i32_nxv1f32 = uitofp undef to %nxv1i32_nxv1f64 = uitofp undef to - %nxv1i64_nxv1f16 = uitofp undef to %nxv1i64_nxv1f32 = uitofp undef to %nxv1i64_nxv1f64 = uitofp undef to - %nxv1i1_nxv1f16 = uitofp undef to %nxv1i1_nxv1f32 = uitofp undef to %nxv1i1_nxv1f64 = uitofp undef to - %nxv2i8_nxv2f16 = uitofp undef to %nxv2i8_nxv2f32 = uitofp undef to %nxv2i8_nxv2f64 = uitofp undef to - %nxv2i16_nxv2f16 = uitofp undef to %nxv2i16_nxv2f32 = uitofp undef to %nxv2i16_nxv2f64 = uitofp undef to - %nxv2i32_nxv2f16 = uitofp undef to %nxv2i32_nxv2f32 = uitofp undef to %nxv2i32_nxv2f64 = uitofp undef to - %nxv2i64_nxv2f16 = uitofp undef to %nxv2i64_nxv2f32 = uitofp undef to %nxv2i64_nxv2f64 = uitofp undef to - %nxv2i1_nxv2f16 = uitofp undef to %nxv2i1_nxv2f32 = uitofp undef to %nxv2i1_nxv2f64 = uitofp undef to - %nxv4i8_nxv4f16 = uitofp undef to %nxv4i8_nxv4f32 = uitofp undef to %nxv4i8_nxv4f64 = uitofp undef to - %nxv4i16_nxv4f16 = uitofp undef to %nxv4i16_nxv4f32 = uitofp undef to %nxv4i16_nxv4f64 = uitofp undef to - %nxv4i32_nxv4f16 = uitofp undef to %nxv4i32_nxv4f32 = uitofp undef to %nxv4i32_nxv4f64 = uitofp undef to - %nxv4i64_nxv4f16 = uitofp undef to %nxv4i64_nxv4f32 = uitofp undef to %nxv4i64_nxv4f64 = uitofp undef to - %nxv4i1_nxv4f16 = uitofp undef to %nxv4i1_nxv4f32 = uitofp undef to %nxv4i1_nxv4f64 = uitofp undef to - %nxv8i8_nxv8f16 = uitofp undef to %nxv8i8_nxv8f32 = uitofp undef to %nxv8i8_nxv8f64 = uitofp undef to - %nxv8i16_nxv8f16 = uitofp undef to %nxv8i16_nxv8f32 = uitofp undef to %nxv8i16_nxv8f64 = uitofp undef to - %nxv8i32_nxv8f16 = uitofp undef to %nxv8i32_nxv8f32 = uitofp undef to %nxv8i32_nxv8f64 = uitofp undef to - %nxv8i64_nxv8f16 = uitofp undef to %nxv8i64_nxv8f32 = uitofp undef to %nxv8i64_nxv8f64 = uitofp undef to - %nxv8i1_nxv8f16 = uitofp undef to %nxv8i1_nxv8f32 = uitofp undef to %nxv8i1_nxv8f64 = uitofp undef to - %nxv16i8_nxv16f16 = uitofp undef to %nxv16i8_nxv16f32 = uitofp undef to %nxv16i8_nxv16f64 = uitofp undef to - %nxv16i16_nxv16f16 = uitofp undef to %nxv16i16_nxv16f32 = uitofp undef to %nxv16i16_nxv16f64 = uitofp undef to - %nxv16i32_nxv16f16 = uitofp undef to %nxv16i32_nxv16f32 = uitofp undef to %nxv16i32_nxv16f64 = uitofp undef to - %nxv16i64_nxv16f16 = uitofp undef to %nxv16i64_nxv16f32 = uitofp 
From b294951e3967730ffad14d51297694b1411d7af6 Mon Sep 17 00:00:00 2001
From: yronglin
Date: Fri, 30 Aug 2024 17:34:00 +0800
Subject: [PATCH 06/98] [clang][bytecode] Fix the handling of address of a vector (#106558)

The PR https://github.com/llvm/llvm-project/pull/105996 broke taking the
address of a vector:

**compound-literal.c**
```C
typedef int v4i32 __attribute((vector_size(16)));
v4i32 *y = &(v4i32){1,2,3,4};
```

That is because the current interpreter handles vector unary operators as a
fallback when the generic code path fails, but the new interpreter does not,
so we need to handle `UO_AddrOf` in `Compiler::VisitVectorUnaryOperator`.

Signed-off-by: yronglin
---
 clang/lib/AST/ByteCode/Compiler.cpp   | 4 ++--
 clang/test/CodeGen/compound-literal.c | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index 6a77323d939791..9bd77edb0a550f 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -5324,11 +5324,11 @@ bool Compiler<Emitter>::VisitVectorUnaryOperator(const UnaryOperator *E) {
   auto UnaryOp = E->getOpcode();
   if (UnaryOp != UO_Plus && UnaryOp != UO_Minus && UnaryOp != UO_LNot &&
-      UnaryOp != UO_Not)
+      UnaryOp != UO_Not && UnaryOp != UO_AddrOf)
     return this->emitInvalid(E);

   // Nothing to do here.
-  if (UnaryOp == UO_Plus)
+  if (UnaryOp == UO_Plus || UnaryOp == UO_AddrOf)
     return this->delegate(SubExpr);

   if (!Initializing) {
diff --git a/clang/test/CodeGen/compound-literal.c b/clang/test/CodeGen/compound-literal.c
index 5b3cebb7c6ae6a..5fe9594c0f954f 100644
--- a/clang/test/CodeGen/compound-literal.c
+++ b/clang/test/CodeGen/compound-literal.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -fexperimental-new-constant-interpreter -emit-llvm %s -o - | FileCheck %s

 // Capture the type and name so matching later is cleaner.
 struct CompoundTy { int a; };
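For readers who want to try the regression locally, the two-line reproducer from the message above expands to the following self-contained C program; the `main` harness and the element read-back are illustrative additions, not part of the Clang test suite.

```c
/* Reproducer for #106558: taking the address of a vector compound literal.
 * A file-scope compound literal has static storage duration in C, so its
 * address is a valid constant initializer. */
typedef int v4i32 __attribute__((vector_size(16)));

v4i32 *y = &(v4i32){1, 2, 3, 4};

int main(void) {
  /* Reading an element back through the pointer exercises the value the
   * constant interpreter materialized for the literal. */
  return (*y)[2] == 3 ? 0 : 1;
}
```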
Signed-off-by: yronglin --- clang/lib/AST/ByteCode/Compiler.cpp | 4 ++-- clang/test/CodeGen/compound-literal.c | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 6a77323d939791..9bd77edb0a550f 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -5324,11 +5324,11 @@ bool Compiler<Emitter>::VisitVectorUnaryOperator(const UnaryOperator *E) { auto UnaryOp = E->getOpcode(); if (UnaryOp != UO_Plus && UnaryOp != UO_Minus && UnaryOp != UO_LNot && - UnaryOp != UO_Not) + UnaryOp != UO_Not && UnaryOp != UO_AddrOf) return this->emitInvalid(E); // Nothing to do here. - if (UnaryOp == UO_Plus) + if (UnaryOp == UO_Plus || UnaryOp == UO_AddrOf) return this->delegate(SubExpr); if (!Initializing) { diff --git a/clang/test/CodeGen/compound-literal.c b/clang/test/CodeGen/compound-literal.c index 5b3cebb7c6ae6a..5fe9594c0f954f 100644 --- a/clang/test/CodeGen/compound-literal.c +++ b/clang/test/CodeGen/compound-literal.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin -fexperimental-new-constant-interpreter -emit-llvm %s -o - | FileCheck %s // Capture the type and name so matching later is cleaner. struct CompoundTy { int a; };
From 1b32c3e2985f89900030289eaa44e3d92cab85af Mon Sep 17 00:00:00 2001 From: Danial Klimkin Date: Fri, 30 Aug 2024 11:51:44 +0200 Subject: [PATCH 07/98] Add no-op handling for HLSLAttributedResource switch cases (#106698) New value added in e00e9a3f8294c9b96cb0328bf136fab72aeec749 --- lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index 695801da9da69a..b0f49ebf2d2cbb 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -4241,6 +4241,9 @@ TypeSystemClang::GetTypeClass(lldb::opaque_compiler_type_t type) { // We don't handle pack indexing yet case clang::Type::PackIndexing: break; + + case clang::Type::HLSLAttributedResource: + break; } // We don't know hot to display this type... return lldb::eTypeClassOther; @@ -5148,6 +5151,9 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type, // We don't handle pack indexing yet case clang::Type::PackIndexing: break; + + case clang::Type::HLSLAttributedResource: + break; } count = 0; return lldb::eEncodingInvalid; @@ -5309,6 +5315,9 @@ lldb::Format TypeSystemClang::GetFormat(lldb::opaque_compiler_type_t type) { // We don't handle pack indexing yet case clang::Type::PackIndexing: break; + + case clang::Type::HLSLAttributedResource: + break; } // We don't know hot to display this type... return lldb::eFormatBytes;
From a2a93f02930e20930d5ef38464ca9c99eb00ff23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?kadir=20=C3=A7etinkaya?= Date: Fri, 30 Aug 2024 11:57:37 +0200 Subject: [PATCH 08/98] [clang] Cleanup IncludeLocMap (#106241) CompilerInstance can reuse the same SourceManager across multiple frontend actions. During this process it calls `SourceManager::clearIDTables` to reset any caches based on FileIDs. It didn't reset `IncludedLocMap`, resulting in wrong include locations for workflows that triggered multiple frontend actions through the same CompilerInstance.
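The failure mode is easiest to see as a sketch of the reuse pattern (hypothetical driver code; `runAction` is illustrative, not a real API):

```cpp
// One SourceManager serving two compilations. FileIDs are handed out
// deterministically, so IDs from the first run can coincide with IDs from
// the second run; any FileID-keyed cache that survives clearIDTables() is
// then consulted with stale keys.
runAction(Compiler);                          // run 1 fills IncludedLocMap
Compiler.getSourceManager().clearIDTables();  // must drop IncludedLocMap too
runAction(Compiler);                          // run 2 reuses the same FileIDs
```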
--- clang/lib/Basic/SourceManager.cpp | 1 + clang/unittests/Basic/SourceManagerTest.cpp | 60 +++++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index b0256a8ce9ed04..d6ec26af80aadd 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -350,6 +350,7 @@ void SourceManager::clearIDTables() { LastLineNoContentCache = nullptr; LastFileIDLookup = FileID(); + IncludedLocMap.clear(); if (LineTable) LineTable->clear(); diff --git a/clang/unittests/Basic/SourceManagerTest.cpp b/clang/unittests/Basic/SourceManagerTest.cpp index 45840f5188cdcd..0f2476bd8b0612 100644 --- a/clang/unittests/Basic/SourceManagerTest.cpp +++ b/clang/unittests/Basic/SourceManagerTest.cpp @@ -20,6 +20,7 @@ #include "clang/Lex/PreprocessorOptions.h" #include "llvm/ADT/SmallString.h" #include "llvm/Config/llvm-config.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Process.h" #include "gtest/gtest.h" #include @@ -453,6 +454,65 @@ TEST_F(SourceManagerTest, loadedSLocEntryIsInTheSameTranslationUnit) { #if defined(LLVM_ON_UNIX) + +// A single SourceManager instance is sometimes reused across multiple +// compilations. This test makes sure we're resetting caches built for tracking +// include locations that are based on FileIDs, to make sure we don't report +// wrong include locations when FileIDs coincide between two different runs. +TEST_F(SourceManagerTest, ResetsIncludeLocMap) { + auto ParseFile = [&] { + TrivialModuleLoader ModLoader; + HeaderSearch HeaderInfo(std::make_shared<HeaderSearchOptions>(), SourceMgr, + Diags, LangOpts, &*Target); + Preprocessor PP(std::make_shared<PreprocessorOptions>(), Diags, LangOpts, + SourceMgr, HeaderInfo, ModLoader, + /*IILookup =*/nullptr, + /*OwnsHeaderSearch =*/false); + PP.Initialize(*Target); + PP.EnterMainSourceFile(); + PP.LexTokensUntilEOF(); + EXPECT_FALSE(Diags.hasErrorOccurred()); + }; + + auto Buf = llvm::MemoryBuffer::getMemBuffer(""); + FileEntryRef HeaderFile = + FileMgr.getVirtualFileRef("/foo.h", Buf->getBufferSize(), 0); + SourceMgr.overrideFileContents(HeaderFile, std::move(Buf)); + + Buf = llvm::MemoryBuffer::getMemBuffer(R"cpp(#include "/foo.h")cpp"); + FileEntryRef BarFile = + FileMgr.getVirtualFileRef("/bar.h", Buf->getBufferSize(), 0); + SourceMgr.overrideFileContents(BarFile, std::move(Buf)); + SourceMgr.createFileID(BarFile, {}, clang::SrcMgr::C_User); + + Buf = llvm::MemoryBuffer::getMemBuffer(R"cpp(#include "/foo.h")cpp"); + FileID MFID = SourceMgr.createFileID(std::move(Buf)); + SourceMgr.setMainFileID(MFID); + + ParseFile(); + auto FooFID = SourceMgr.getOrCreateFileID(HeaderFile, clang::SrcMgr::C_User); + auto IncFID = SourceMgr.getDecomposedIncludedLoc(FooFID).first; + EXPECT_EQ(IncFID, MFID); + + // Clean up source-manager state before we start next parse. + SourceMgr.clearIDTables(); + + // Set up a new main file. + Buf = llvm::MemoryBuffer::getMemBuffer(R"cpp( + // silly comment 42 + #include "/bar.h")cpp"); + MFID = SourceMgr.createFileID(std::move(Buf)); + SourceMgr.setMainFileID(MFID); + + ParseFile(); + // Make sure foo.h got the same file-id in both runs. + EXPECT_EQ(FooFID, + SourceMgr.getOrCreateFileID(HeaderFile, clang::SrcMgr::C_User)); + auto BarFID = SourceMgr.getOrCreateFileID(BarFile, clang::SrcMgr::C_User); + IncFID = SourceMgr.getDecomposedIncludedLoc(FooFID).first; + // Check that includer is bar.h during this run.
+ EXPECT_EQ(IncFID, BarFID); +} + TEST_F(SourceManagerTest, getMacroArgExpandedLocation) { const char *header = "#define FM(x,y) x\n"; From c4b5cb0f31227074d423b2db378dfbc486a5550e Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 30 Aug 2024 10:58:17 +0100 Subject: [PATCH 09/98] [AArch64] Add accelerate test coverage for acos/asin/atan and cosh/sinh/tanh intrinsics to support #106584 --- ...ccelerate-vector-functions-inseltpoison.ll | 240 ++++++++++++++++++ .../AArch64/accelerate-vector-functions.ll | 240 ++++++++++++++++++ 2 files changed, 480 insertions(+) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll index eae38295ba08cf..809059034c7f98 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll @@ -611,6 +611,46 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_asin_4x(ptr %a) { +; CHECK-LABEL: @int_asin_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_asin_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.asin.f32(float %vecext) + %vecins = insertelement <4 x float> poison, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.asin.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.asin.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.asin.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @acosf(float) readonly nounwind willreturn define 
<4 x float> @acos_4x(ptr %a) { ; CHECK-LABEL: @acos_4x( @@ -652,6 +692,46 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_acos_4x(ptr %a) { +; CHECK-LABEL: @int_acos_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_acos_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.acos.f32(float %vecext) + %vecins = insertelement <4 x float> poison, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.acos.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.acos.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.acos.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @atanf(float) readonly nounwind willreturn define <4 x float> @atan_4x(ptr %a) { ; CHECK-LABEL: @atan_4x( @@ -693,6 +773,46 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_atan_4x(ptr %a) { +; CHECK-LABEL: @int_atan_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatanf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_atan_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = 
extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.atan.f32(float %vecext) + %vecins = insertelement <4 x float> poison, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.atan.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.atan.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.atan.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @sinhf(float) readonly nounwind willreturn define <4 x float> @sinh_4x(ptr %a) { ; CHECK-LABEL: @sinh_4x( @@ -734,6 +854,46 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_sinh_4x(ptr %a) { +; CHECK-LABEL: @int_sinh_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_sinh_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x 
float> %0, i32 0 + %1 = tail call fast float @llvm.sinh.f32(float %vecext) + %vecins = insertelement <4 x float> poison, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.sinh.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.sinh.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.sinh.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @coshf(float) readonly nounwind willreturn define <4 x float> @cosh_4x(ptr %a) { ; CHECK-LABEL: @cosh_4x( @@ -775,6 +935,46 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_cosh_4x(ptr %a) { +; CHECK-LABEL: @int_cosh_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcoshf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_cosh_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.cosh.f32(float %vecext) + %vecins = insertelement <4 x float> poison, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.cosh.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.cosh.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.cosh.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @tanhf(float) readonly nounwind willreturn define <4 x float> @tanh_4x(ptr %a) { ; CHECK-LABEL: @tanh_4x( @@ -816,6 +1016,46 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> 
%vecins.3 } +define <4 x float> @int_tanh_4x(ptr %a) { +; CHECK-LABEL: @int_tanh_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vtanhf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_tanh_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.tanh.f32(float %vecext) + %vecins = insertelement <4 x float> poison, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.tanh.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.tanh.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.tanh.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @asinhf(float) readonly nounwind willreturn define <4 x float> @asinh_4x(ptr %a) { ; CHECK-LABEL: @asinh_4x( diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll index 5e2dd305f05576..36633a1053b14f 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll @@ -611,6 +611,46 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_asin_4x(ptr %a) { +; CHECK-LABEL: @int_asin_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_asin_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: 
[[TMP1:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.asin.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.asin.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.asin.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.asin.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @acosf(float) readonly nounwind willreturn define <4 x float> @acos_4x(ptr %a) { ; CHECK-LABEL: @acos_4x( @@ -652,6 +692,46 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_acos_4x(ptr %a) { +; CHECK-LABEL: @int_acos_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_acos_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] 
= insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.acos.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.acos.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.acos.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.acos.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @atanf(float) readonly nounwind willreturn define <4 x float> @atan_4x(ptr %a) { ; CHECK-LABEL: @atan_4x( @@ -693,6 +773,46 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_atan_4x(ptr %a) { +; CHECK-LABEL: @int_atan_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatanf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_atan_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.atan.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.atan.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.atan.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.atan.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @sinhf(float) readonly 
nounwind willreturn define <4 x float> @sinh_4x(ptr %a) { ; CHECK-LABEL: @sinh_4x( @@ -734,6 +854,46 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_sinh_4x(ptr %a) { +; CHECK-LABEL: @int_sinh_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_sinh_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.sinh.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.sinh.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.sinh.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.sinh.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @coshf(float) readonly nounwind willreturn define <4 x float> @cosh_4x(ptr %a) { ; CHECK-LABEL: @cosh_4x( @@ -775,6 +935,46 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_cosh_4x(ptr %a) { +; CHECK-LABEL: @int_cosh_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcoshf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_cosh_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: 
[[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.cosh.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.cosh.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.cosh.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.cosh.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @tanhf(float) readonly nounwind willreturn define <4 x float> @tanh_4x(ptr %a) { ; CHECK-LABEL: @tanh_4x( @@ -816,6 +1016,46 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_tanh_4x(ptr %a) { +; CHECK-LABEL: @int_tanh_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vtanhf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_tanh_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = 
extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.tanh.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.tanh.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.tanh.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.tanh.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @asinhf(float) readonly nounwind willreturn define <4 x float> @asinh_4x(ptr %a) { ; CHECK-LABEL: @asinh_4x(
From 833ce5d27b4e5452db73bf1b4eace7b1891f8650 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Fri, 30 Aug 2024 10:16:44 +0000 Subject: [PATCH 10/98] [mlir][ArmSME] Fix test after #98043 (NFC) --- .../Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir index cc05940fb6d02c..0ee016627440f7 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir @@ -22,7 +22,7 @@ func.func @test_outerproduct_no_accumulator_4x4xf32() { %c0 = arith.constant 0 : index - %vector_i32 = llvm.intr.experimental.stepvector : vector<[4]xi32> + %vector_i32 = llvm.intr.stepvector : vector<[4]xi32> %vector = arith.sitofp %vector_i32 : vector<[4]xi32> to vector<[4]xf32> %tile = vector.outerproduct %vector, %vector : vector<[4]xf32>, vector<[4]xf32> @@ -47,7 +47,7 @@ func.func @test_outerproduct_with_accumulator_4x4xf32() { %f10 = arith.constant 10.0 : f32 %acc = vector.splat %f10 : vector<[4]x[4]xf32> - %vector_i32 = llvm.intr.experimental.stepvector : vector<[4]xi32> + %vector_i32 = llvm.intr.stepvector : vector<[4]xi32> %vector = arith.sitofp %vector_i32 : vector<[4]xi32> to vector<[4]xf32> %tile = vector.outerproduct %vector, %vector, %acc : vector<[4]xf32>, vector<[4]xf32> @@ -71,7 +71,7 @@ func.func @test_masked_outerproduct_no_accumulator_4x4xf32() { %c0 = arith.constant 0 : index %ones = arith.constant dense<1> : vector<[4]xi32> - %step_vector = llvm.intr.experimental.stepvector : vector<[4]xi32> + %step_vector = llvm.intr.stepvector : vector<[4]xi32> %vector_i32 = arith.addi %step_vector, %ones : vector<[4]xi32> %vector = arith.sitofp %vector_i32 : vector<[4]xi32> to vector<[4]xf32> @@ -104,7 +104,7 @@ func.func @test_masked_outerproduct_with_accumulator_4x4xf32() { %f10 = arith.constant 10.0 : f32 %acc = vector.splat %f10 : vector<[4]x[4]xf32> - %step_vector = llvm.intr.experimental.stepvector : vector<[4]xi32> + %step_vector = llvm.intr.stepvector : vector<[4]xi32> %vector_i32 = arith.addi %step_vector, %ones : vector<[4]xi32> %vector = arith.sitofp %vector_i32 : vector<[4]xi32> to vector<[4]xf32>
From c8568f09577e9332d15edf98beb5376dc8d0672e Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Fri, 30 Aug 2024 18:28:21 +0800 Subject: [PATCH 11/98] [mlir][tosa] Add missing check for multiples of `tosa.tile` (#106337) This patch adds a check for the multiples of `tosa.tile`. The `multiples` in `tosa.tile` indicates how many times the tensor should be replicated along each dimension. Zero and negative values are invalid, except for -1, which represents a dynamic value. Therefore, each element of `multiples` should be a positive integer or -1. Fixes #106167.
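Stated as a standalone predicate (an illustrative sketch; the helper name is ours, but the logic mirrors the verifier change below):

```cpp
#include <cstdint>

// Valid `tosa.tile` multiples: strictly positive, or -1 meaning "dynamic".
bool isValidTileMultiple(int64_t V) { return V > 0 || V == -1; }
```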
--- mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 4 ++++ mlir/test/Dialect/Tosa/invalid.mlir | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index c8e2b04eea0e22..267a875710ed71 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -930,6 +930,10 @@ LogicalResult tosa::TileOp::verify() { return emitOpError("expect 'multiples' array to have length ") << outputType.getRank() << " but got " << multiples.size() << "."; + if (llvm::any_of(multiples, [](int64_t v) { return v <= 0 && v != -1; })) + return emitOpError( + "expect element of 'multiples' to be positive integer or -1."); + return success(); } diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir index 806ba22e1bbe8c..e72e154f952771 100644 --- a/mlir/test/Dialect/Tosa/invalid.mlir +++ b/mlir/test/Dialect/Tosa/invalid.mlir @@ -424,6 +424,24 @@ func.func @test_tile_invalid_multiples() { // ----- +func.func @test_tile_invalid_multiples_value() { + %0 = tensor.empty() : tensor<4x31xf32> + // expected-error@+1 {{'tosa.tile' op expect element of 'multiples' to be positive integer or -1.}} + %1 = tosa.tile %0 {multiples = array} : (tensor<4x31xf32>) -> tensor<4x31xf32> + return +} + +// ----- + +func.func @test_tile_io_rank_mismatch() { + %0 = tensor.empty() : tensor<4x31xf32> + // expected-error@+1 {{'tosa.tile' op expect same input and output tensor rank.}} + %1 = tosa.tile %0 {multiples = array} : (tensor<4x31xf32>) -> tensor<4x31x31xf32> + return +} + +// ----- + // CHECK-LABEL: @test_invalid_constant_permutation func.func @test_invalid_constant_permutation() { // expected-error@+3 {{permutation must be within input bounds}}
From f0e34f381866b82a26241f7e9aa5964f0dd11ebd Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 30 Aug 2024 11:29:29 +0100 Subject: [PATCH 12/98] [VPlan] Don't skip optimizable truncs in planContainsAdditionalSimps. An optimizable cast can also be removed by VPlan simplifications. Remove the restriction from planContainsAdditionalSimplifications, as this causes it to miss relevant simplifications, triggering false positives for the cost decision verification. Also adds debug output for printing additional cost precomputations. Fixes https://github.com/llvm/llvm-project/issues/106641.
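In effect, the cost-decision verifier enforces the following invariant (a simplified sketch using the names from the diff below, not the exact source):

```cpp
// The VPlan-based and legacy cost models must agree on the chosen VF,
// unless the VPlan simplified something the legacy model cannot see,
// which now includes optimizable (e.g. truncatable) casts.
assert((BestFactor.Width == LegacyVF.Width ||
        planContainsAdditionalSimplifications(BestPlan, CostCtx, OrigLoop)) &&
       "VPlan cost model and legacy cost model disagreed");
```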
--- .../Transforms/Vectorize/LoopVectorize.cpp | 34 ++++----- .../truncate-to-minimal-bitwidth-cost.ll | 73 +++++++++++++++++++ 2 files changed, 90 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 6babfd1eee9108..fa05b8dd22426f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7147,7 +7152,12 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, if (!OrigLoop->contains(CondI) || !CostCtx.SkipCostComputation.insert(CondI).second) continue; - Cost += CostCtx.getLegacyCost(CondI, VF); + InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF); + LLVM_DEBUG({ + dbgs() << "Cost of " << CondICost << " for VF " << VF + << ": exit condition instruction " << *CondI << "\n"; + }); + Cost += CondICost; for (Value *Op : CondI->operands()) { auto *OpI = dyn_cast<Instruction>(Op); if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) { @@ -7250,10 +7255,9 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, /// not have corresponding recipes in \p Plan and are not marked to be ignored /// in \p CostCtx. This means the VPlan contains simplification that the legacy /// cost-model did not account for. -static bool -planContainsAdditionalSimplifications(VPlan &Plan, ElementCount VF, - VPCostContext &CostCtx, Loop *TheLoop, - LoopVectorizationCostModel &CM) { +static bool planContainsAdditionalSimplifications(VPlan &Plan, + VPCostContext &CostCtx, + Loop *TheLoop) { // First collect all instructions for the recipes in Plan. auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * { if (auto *S = dyn_cast<VPSingleDefRecipe>(R)) @@ -7284,16 +7288,13 @@ planContainsAdditionalSimplifications(VPlan &Plan, ElementCount VF, // Return true if the loop contains any instructions that are not also part of // the VPlan or are skipped for VPlan-based cost computations. This indicates // that the VPlan contains extra simplifications.
- return any_of( - TheLoop->blocks(), [&SeenInstrs, VF, &CostCtx, &CM](BasicBlock *BB) { - return any_of(*BB, [&SeenInstrs, VF, &CostCtx, &CM](Instruction &I) { - if (isa<PHINode>(&I)) - return false; - return !SeenInstrs.contains(&I) && - !CostCtx.skipCostComputation(&I, true) && - !CM.canTruncateToMinimalBitwidth(&I, VF); - }); - }); + return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx](BasicBlock *BB) { + return any_of(*BB, [&SeenInstrs, &CostCtx](Instruction &I) { + if (isa<PHINode>(&I)) + return false; + return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true); + }); + }); } #endif @@ -7364,8 +7365,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { precomputeCosts(BestPlan, BestFactor.Width, CostCtx); assert((BestFactor.Width == LegacyVF.Width || planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), - BestFactor.Width, CostCtx, - OrigLoop, CM)) && + CostCtx, OrigLoop)) && " VPlan cost model and legacy cost model disagreed"); assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) && "when vectorizing, the scalar cost must be computed."); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll index 1db718a0e42f9f..3e2f290a497db1 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll @@ -152,6 +152,77 @@ exit: ret void } +; Test case for https://github.com/llvm/llvm-project/issues/106641. +define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 { +; CHECK-LABEL: define void @truncate_to_i1_used_by_branch( +; CHECK-SAME: i8 [[X:%.*]], ptr [[DST:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i8> [[BROADCAST_SPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = trunc <2 x i8> [[BROADCAST_SPLAT]] to <2 x i1> +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT3]], <2 x ptr> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1> +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[VEC_IV]], i32 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 [[TMP1]], i32 2) +; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i1> <i1 true, i1 true>, [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer +; CHECK-NEXT: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> zeroinitializer, <2 x ptr> [[BROADCAST_SPLAT4]], i32 1, <2 x i1> [[TMP3]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label
%[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[F_039:%.*]] = phi i8 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = or i8 23, [[X]] +; CHECK-NEXT: [[EXTRACT_T:%.*]] = trunc i8 [[TMP4]] to i1 +; CHECK-NEXT: br i1 [[EXTRACT_T]], label %[[THEN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[THEN]]: +; CHECK-NEXT: store i8 0, ptr [[DST]], align 1 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[ADD]] = add i8 [[F_039]], 1 +; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[F_039]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[CONV]], 1 +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %f.039 = phi i8 [ 0, %entry ], [ %add, %loop.latch ] + %0 = or i8 23, %x + %extract.t = trunc i8 %0 to i1 + br i1 %extract.t, label %then, label %loop.latch + +then: + store i8 0, ptr %dst, align 1 + br label %loop.latch + +loop.latch: + %add = add i8 %f.039, 1 + %conv = sext i8 %f.039 to i32 + %cmp = icmp slt i32 %conv, 1 + br i1 %cmp, label %loop.header, label %exit + +exit: + ret void +} + +attributes #0 = { "target-features"="+64bit,+v,+zvl256b" } + ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} @@ -159,4 +230,6 @@ exit: ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} ;.
From fab925651685505906416dca48469fd9f69ba39a Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Fri, 30 Aug 2024 11:37:25 +0100 Subject: [PATCH 13/98] [LLVM][AArch64] Fix invalid use of AArch64ISD::UZP2 in performConcatVectorsCombine. (#104774) UZP2 requires both operands to match the result type but the combine tries to replace a truncate by passing the pre-truncated operands directly to an UZP2 with the truncated result type. This patch nop-casts the operands to keep the DAG consistent. There should be no changes to the generated code, which is fine as it is. This patch also enables more target-specific getNode() validation for fixed length vector types. --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 3296f63a9b8876..28ad0abf25703b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -19852,7 +19852,6 @@ static SDValue performConcatVectorsCombine(SDNode *N, // This optimization reduces instruction count.
if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR && N00->getOperand(1) == N10->getOperand(1)) { - SDValue N000 = N00->getOperand(0); SDValue N100 = N10->getOperand(0); uint64_t N001ConstVal = N00->getConstantOperandVal(1), @@ -19860,7 +19859,8 @@ static SDValue performConcatVectorsCombine(SDNode *N, NScalarSize = N->getValueType(0).getScalarSizeInBits(); if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) { - + N000 = DAG.getNode(AArch64ISD::NVCAST, dl, VT, N000); + N100 = DAG.getNode(AArch64ISD::NVCAST, dl, VT, N100); SDValue Uzp = DAG.getNode(AArch64ISD::UZP2, dl, VT, N000, N100); SDValue NewShiftConstant = DAG.getConstant(N001ConstVal - NScalarSize, dl, MVT::i32); @@ -29344,8 +29344,10 @@ void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const { assert(OpVT.getSizeInBits() == VT.getSizeInBits() && "Expected vectors of equal size!"); // TODO: Enable assert once bogus creations have been fixed. - // assert(OpVT.getVectorElementCount() == VT.getVectorElementCount()*2 && - // "Expected result vector with half the lanes of its input!"); + if (VT.isScalableVector()) + break; + assert(OpVT.getVectorElementCount() == VT.getVectorElementCount() * 2 && + "Expected result vector with half the lanes of its input!"); break; } case AArch64ISD::TRN1: @@ -29362,7 +29364,9 @@ void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const { assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() && "Expected vectors!"); // TODO: Enable assert once bogus creations have been fixed. - // assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!"); + if (VT.isScalableVector()) + break; + assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!"); break; } } From 68d8b3846ab1e6550910f2a9a685690eee558af2 Mon Sep 17 00:00:00 2001 From: OverMighty Date: Fri, 30 Aug 2024 12:59:05 +0200 Subject: [PATCH 14/98] [builtins] Fix missing main() function in float16/bfloat16 support checks (#104478) The CMake docs state that `check_c_source_compiles()` checks whether the supplied code "can be compiled as a C source file and linked as an executable (so it must contain at least a `main()` function)." https://cmake.org/cmake/help/v3.30/module/CheckCSourceCompiles.html In practice, this command is a wrapper around `try_compile()`: - https://gitlab.kitware.com/cmake/cmake/blob/2904ce00d2ed6ad5dac6d3459af62d8223e06ce0/Modules/CheckCSourceCompiles.cmake#L54 - https://gitlab.kitware.com/cmake/cmake/blob/2904ce00d2ed6ad5dac6d3459af62d8223e06ce0/Modules/Internal/CheckSourceCompiles.cmake#L101 When `CMAKE_SOURCE_DIR` is compiler-rt/lib/builtins/, `CMAKE_TRY_COMPILE_TARGET_TYPE` is set to `STATIC_LIBRARY`, so the checks for `float16` and `bfloat16` support work as intended in a Clang + compiler-rt runtime build for example, as it runs CMake recursively from that directory. However, when using llvm/ or compiler-rt/ as CMake source directory, as `CMAKE_TRY_COMPILE_TARGET_TYPE` defaults to `EXECUTABLE`, these checks will indeed fail if the code doesn't have a `main()` function. This results in LLVM using x86 SIMD registers when generating calls to builtins that, with Arch Linux's compiler-rt package for example, actually use a GPR for their argument or return value as they use `uint16_t` instead of `_Float16`. This had been caught in post-commit review: https://reviews.llvm.org/D145237#4521152. Use of the internal `CMAKE_C_COMPILER_WORKS` variable is not what hides the issue, however. 
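Concretely, when `CMAKE_TRY_COMPILE_TARGET_TYPE` is `EXECUTABLE`, each probe must be a complete program that links on its own. A minimal sketch of the shape the checks need (mirroring the fix below; compiles as C or C++ with Clang on targets that support these types):

```cpp
_Float16 foo(_Float16 x) { return x; }
int main(void) { return 0; } // required so try_compile() can link an executable
```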
PR #69842 tried to fix this by unconditionally setting `CMAKE_TRY_COMPILE_TARGET_TYPE` to `STATIC_LIBRARY`, but it apparently caused other issues, so it was reverted. This PR just adds a `main()` function in the checks, as per the CMake docs. --- compiler-rt/lib/builtins/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 13adbd6c4d57d9..2c3b0fa84a4782 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -868,10 +868,12 @@ else () endif() endif() endif() - check_c_source_compiles("_Float16 foo(_Float16 x) { return x; }" + check_c_source_compiles("_Float16 foo(_Float16 x) { return x; } + int main(void) { return 0; }" COMPILER_RT_HAS_${arch}_FLOAT16) append_list_if(COMPILER_RT_HAS_${arch}_FLOAT16 -DCOMPILER_RT_HAS_FLOAT16 BUILTIN_CFLAGS_${arch}) - check_c_source_compiles("__bf16 foo(__bf16 x) { return x; }" + check_c_source_compiles("__bf16 foo(__bf16 x) { return x; } + int main(void) { return 0; }" COMPILER_RT_HAS_${arch}_BFLOAT16) # Build BF16 files only when "__bf16" is available. if(COMPILER_RT_HAS_${arch}_BFLOAT16) From b065ec0af54988559334314ebbd51dd515d5bdd6 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 30 Aug 2024 11:50:25 +0100 Subject: [PATCH 15/98] [Inline][X86] Regenerate inline-target-cpu-* tests --- .../Inline/X86/inline-target-cpu-i686.ll | 9 +++++-- .../Inline/X86/inline-target-cpu-x86_64.ll | 27 +++++++++++++------ 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/llvm/test/Transforms/Inline/X86/inline-target-cpu-i686.ll b/llvm/test/Transforms/Inline/X86/inline-target-cpu-i686.ll index bd05cffcaa8b7f..187278d1c9035a 100644 --- a/llvm/test/Transforms/Inline/X86/inline-target-cpu-i686.ll +++ b/llvm/test/Transforms/Inline/X86/inline-target-cpu-i686.ll @@ -1,12 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=i686-unknown-unknown -S -passes=inline | FileCheck %s define i32 @func_target_cpu_nocona() #0 { +; CHECK-LABEL: @func_target_cpu_nocona( +; CHECK-NEXT: ret i32 0 +; ret i32 0 } -; CHECK-LABEL: @target_cpu_prescott_call_target_cpu_nocona( -; CHECK-NEXT: ret i32 0 define i32 @target_cpu_prescott_call_target_cpu_nocona() #1 { +; CHECK-LABEL: @target_cpu_prescott_call_target_cpu_nocona( +; CHECK-NEXT: ret i32 0 +; %call = call i32 @func_target_cpu_nocona() ret i32 %call } diff --git a/llvm/test/Transforms/Inline/X86/inline-target-cpu-x86_64.ll b/llvm/test/Transforms/Inline/X86/inline-target-cpu-x86_64.ll index b0a145d54cf593..e6693a637d820d 100644 --- a/llvm/test/Transforms/Inline/X86/inline-target-cpu-x86_64.ll +++ b/llvm/test/Transforms/Inline/X86/inline-target-cpu-x86_64.ll @@ -1,37 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown-unknown -S -passes=inline | FileCheck %s define i32 @func_target_cpu_base() #0 { +; CHECK-LABEL: @func_target_cpu_base( +; CHECK-NEXT: ret i32 0 +; ret i32 0 } -; CHECK-LABEL: @target_cpu_k8_call_target_cpu_base( -; CHECK-NEXT: ret i32 0 define i32 @target_cpu_k8_call_target_cpu_base() #1 { +; CHECK-LABEL: @target_cpu_k8_call_target_cpu_base( +; CHECK-NEXT: ret i32 0 +; %call = call i32 @func_target_cpu_base() ret i32 %call } -; CHECK-LABEL: @target_cpu_target_nehalem_call_target_cpu_base( -; CHECK-NEXT: ret i32 0 define i32 @target_cpu_target_nehalem_call_target_cpu_base() #2 { +; CHECK-LABEL: 
@target_cpu_target_nehalem_call_target_cpu_base( +; CHECK-NEXT: ret i32 0 +; %call = call i32 @func_target_cpu_base() ret i32 %call } -; CHECK-LABEL: @target_cpu_target_goldmont_call_target_cpu_base( -; CHECK-NEXT: ret i32 0 define i32 @target_cpu_target_goldmont_call_target_cpu_base() #3 { +; CHECK-LABEL: @target_cpu_target_goldmont_call_target_cpu_base( +; CHECK-NEXT: ret i32 0 +; %call = call i32 @func_target_cpu_base() ret i32 %call } define i32 @func_target_cpu_nocona() #4 { +; CHECK-LABEL: @func_target_cpu_nocona( +; CHECK-NEXT: ret i32 0 +; ret i32 0 } -; CHECK-LABEL: @target_cpu_target_base_call_target_cpu_nocona( -; CHECK-NEXT: ret i32 0 define i32 @target_cpu_target_base_call_target_cpu_nocona() #0 { +; CHECK-LABEL: @target_cpu_target_base_call_target_cpu_nocona( +; CHECK-NEXT: ret i32 0 +; %call = call i32 @func_target_cpu_nocona() ret i32 %call } From fda7649b3c3797ddbb35a46746ae7876ab147612 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 30 Aug 2024 12:06:05 +0100 Subject: [PATCH 16/98] [InstCombine][X86] Split off vperm shuffle tests from other avx512 tests --- .../X86/x86-avx512-inseltpoison.ll | 1360 ---------------- .../Transforms/InstCombine/X86/x86-avx512.ll | 1360 ---------------- .../Transforms/InstCombine/X86/x86-vperm.ll | 1362 +++++++++++++++++ 3 files changed, 1362 insertions(+), 2720 deletions(-) create mode 100644 llvm/test/Transforms/InstCombine/X86/x86-vperm.ll diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll index 80d8e1b16ed28b..3c44da84813fdb 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll @@ -1814,1366 +1814,6 @@ define double @test_mask3_vfnmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double> ret double %13 } -declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) - -define <8 x i32> @identity_test_permvar_si_256(<8 x i32> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_si_256( -; CHECK-NEXT: ret <8 x i32> [[A0:%.*]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) - ret <8 x i32> %1 -} - -define <8 x i32> @identity_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_si_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> [[A0:%.*]], <8 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i32> [[TMP2]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru - ret <8 x i32> %3 -} - -define <8 x i32> @zero_test_permvar_si_256(<8 x i32> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_si_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: ret <8 x i32> [[TMP1]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> zeroinitializer) - ret <8 x i32> %1 -} - -define <8 x i32> @zero_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_si_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: 
ret <8 x i32> [[TMP3]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru - ret <8 x i32> %3 -} - -define <8 x i32> @shuffle_test_permvar_si_256(<8 x i32> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_si_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[TMP1]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) - ret <8 x i32> %1 -} - -define <8 x i32> @shuffle_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_si_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru - ret <8 x i32> %3 -} - -define <8 x i32> @undef_test_permvar_si_256(<8 x i32> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_si_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[TMP1]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) - ret <8 x i32> %1 -} - -define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_si_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru - ret <8 x i32> %3 -} - -declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) - -define <8 x float> @identity_test_permvar_sf_256(<8 x float> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_sf_256( -; CHECK-NEXT: ret <8 x float> [[A0:%.*]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) - ret <8 x float> %1 -} - -define <8 x float> @identity_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_sf_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[A0:%.*]], <8 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x float> [[TMP2]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru - ret <8 x float> %3 -} - -define <8 x float> @zero_test_permvar_sf_256(<8 x float> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_sf_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: ret <8 x float> [[TMP1]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> zeroinitializer) - ret <8 x float> %1 -} - -define <8 x float> 
@zero_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_sf_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x float> [[TMP3]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru - ret <8 x float> %3 -} - -define <8 x float> @shuffle_test_permvar_sf_256(<8 x float> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_sf_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP1]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) - ret <8 x float> %1 -} - -define <8 x float> @shuffle_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_sf_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x float> [[TMP3]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru - ret <8 x float> %3 -} - -define <8 x float> @undef_test_permvar_sf_256(<8 x float> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_sf_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP1]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) - ret <8 x float> %1 -} - -define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_sf_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x float> [[TMP3]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru - ret <8 x float> %3 -} - -declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>) - -define <4 x i64> @identity_test_permvar_di_256(<4 x i64> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_di_256( -; CHECK-NEXT: ret <4 x i64> [[A0:%.*]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - ret <4 x i64> %1 -} - -define <4 x i64> @identity_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_di_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[A0:%.*]], <4 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x i64> [[TMP2]] -; - %1 = 
call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru - ret <4 x i64> %3 -} - -define <4 x i64> @zero_test_permvar_di_256(<4 x i64> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_di_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: ret <4 x i64> [[TMP1]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> zeroinitializer) - ret <4 x i64> %1 -} - -define <4 x i64> @zero_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_di_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru - ret <4 x i64> %3 -} - -define <4 x i64> @shuffle_test_permvar_di_256(<4 x i64> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_di_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: ret <4 x i64> [[TMP1]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - ret <4 x i64> %1 -} - -define <4 x i64> @shuffle_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_di_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru - ret <4 x i64> %3 -} - -define <4 x i64> @undef_test_permvar_di_256(<4 x i64> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_di_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: ret <4 x i64> [[TMP1]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - ret <4 x i64> %1 -} - -define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_di_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x 
i64> [[TMP3]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru - ret <4 x i64> %3 -} - -declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>) - -define <4 x double> @identity_test_permvar_df_256(<4 x double> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_df_256( -; CHECK-NEXT: ret <4 x double> [[A0:%.*]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - ret <4 x double> %1 -} - -define <4 x double> @identity_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_df_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[A0:%.*]], <4 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x double> [[TMP2]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru - ret <4 x double> %3 -} - -define <4 x double> @zero_test_permvar_df_256(<4 x double> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_df_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: ret <4 x double> [[TMP1]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> zeroinitializer) - ret <4 x double> %1 -} - -define <4 x double> @zero_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_df_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x double> [[TMP3]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru - ret <4 x double> %3 -} - -define <4 x double> @shuffle_test_permvar_df_256(<4 x double> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_df_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: ret <4 x double> [[TMP1]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - ret <4 x double> %1 -} - -define <4 x double> @shuffle_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_df_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> 
[[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x double> [[TMP3]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru - ret <4 x double> %3 -} - -define <4 x double> @undef_test_permvar_df_256(<4 x double> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_df_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: ret <4 x double> [[TMP1]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - ret <4 x double> %1 -} - -define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_df_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x double> [[TMP3]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru - ret <4 x double> %3 -} - -declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) - -define <16 x i32> @identity_test_permvar_si_512(<16 x i32> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_si_512( -; CHECK-NEXT: ret <16 x i32> [[A0:%.*]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - ret <16 x i32> %1 -} - -define <16 x i32> @identity_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_si_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i32> [[A0:%.*]], <16 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i32> [[TMP2]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru - ret <16 x i32> %3 -} - -define <16 x i32> @zero_test_permvar_si_512(<16 x i32> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: ret <16 x i32> [[TMP1]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> zeroinitializer) - ret <16 x i32> %1 -} - -define <16 x i32> @zero_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_si_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> zeroinitializer) - %2 = bitcast i16 
%mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru - ret <16 x i32> %3 -} - -define <16 x i32> @shuffle_test_permvar_si_512(<16 x i32> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i32> [[TMP1]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - ret <16 x i32> %1 -} - -define <16 x i32> @shuffle_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_si_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru - ret <16 x i32> %3 -} - -define <16 x i32> @undef_test_permvar_si_512(<16 x i32> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i32> [[TMP1]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - ret <16 x i32> %1 -} - -define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_si_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru - ret <16 x i32> %3 -} - -declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>) - -define <16 x float> @identity_test_permvar_sf_512(<16 x float> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_sf_512( -; CHECK-NEXT: ret <16 x float> [[A0:%.*]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - ret <16 x float> %1 -} - -define <16 x float> @identity_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_sf_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x float> [[A0:%.*]], <16 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x float> [[TMP2]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru - ret <16 x float> %3 -} - -define <16 x float> @zero_test_permvar_sf_512(<16 x float> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: ret <16 x float> [[TMP1]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> zeroinitializer) - ret <16 x 
float> %1 -} - -define <16 x float> @zero_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_sf_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x float> [[TMP3]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> zeroinitializer) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru - ret <16 x float> %3 -} - -define <16 x float> @shuffle_test_permvar_sf_512(<16 x float> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: ret <16 x float> [[TMP1]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - ret <16 x float> %1 -} - -define <16 x float> @shuffle_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_sf_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x float> [[TMP3]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru - ret <16 x float> %3 -} - -define <16 x float> @undef_test_permvar_sf_512(<16 x float> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: ret <16 x float> [[TMP1]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - ret <16 x float> %1 -} - -define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_sf_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x float> [[TMP3]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru - ret <16 x float> %3 -} - -declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) - -define <8 x i64> @identity_test_permvar_di_512(<8 x i64> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_di_512( -; CHECK-NEXT: ret <8 x i64> [[A0:%.*]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - ret <8 x i64> %1 -} - -define <8 x i64> @identity_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_di_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> [[A0:%.*]], <8 
x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i64> [[TMP2]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru - ret <8 x i64> %3 -} - -define <8 x i64> @zero_test_permvar_di_512(<8 x i64> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: ret <8 x i64> [[TMP1]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> zeroinitializer) - ret <8 x i64> %1 -} - -define <8 x i64> @zero_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_di_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru - ret <8 x i64> %3 -} - -define <8 x i64> @shuffle_test_permvar_di_512(<8 x i64> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i64> [[TMP1]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - ret <8 x i64> %1 -} - -define <8 x i64> @shuffle_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_di_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru - ret <8 x i64> %3 -} - -define <8 x i64> @undef_test_permvar_di_512(<8 x i64> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i64> [[TMP1]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - ret <8 x i64> %1 -} - -define <8 x i64> @undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_di_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru - ret <8 x i64> %3 -} - -declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>) - -define <8 x double> @identity_test_permvar_df_512(<8 x double> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_df_512( -; 
CHECK-NEXT: ret <8 x double> [[A0:%.*]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - ret <8 x double> %1 -} - -define <8 x double> @identity_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_df_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x double> [[A0:%.*]], <8 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x double> [[TMP2]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru - ret <8 x double> %3 -} - -define <8 x double> @zero_test_permvar_df_512(<8 x double> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: ret <8 x double> [[TMP1]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> zeroinitializer) - ret <8 x double> %1 -} - -define <8 x double> @zero_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_df_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x double> [[TMP3]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru - ret <8 x double> %3 -} - -define <8 x double> @shuffle_test_permvar_df_512(<8 x double> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: ret <8 x double> [[TMP1]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - ret <8 x double> %1 -} - -define <8 x double> @shuffle_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_df_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x double> [[TMP3]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru - ret <8 x double> %3 -} - -define <8 x double> @undef_test_permvar_df_512(<8 x double> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: ret <8 x double> [[TMP1]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - ret <8 x double> %1 -} - -define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_df_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> 
[[A0:%.*]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x double> [[TMP3]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru - ret <8 x double> %3 -} - -declare <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16>, <8 x i16>) - -define <8 x i16> @identity_test_permvar_hi_128(<8 x i16> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_hi_128( -; CHECK-NEXT: ret <8 x i16> [[A0:%.*]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - ret <8 x i16> %1 -} - -define <8 x i16> @identity_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_hi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A0:%.*]], <8 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i16> [[TMP2]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru - ret <8 x i16> %3 -} - -define <8 x i16> @zero_test_permvar_hi_128(<8 x i16> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_hi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: ret <8 x i16> [[TMP1]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> zeroinitializer) - ret <8 x i16> %1 -} - -define <8 x i16> @zero_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_hi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i16> [[TMP3]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru - ret <8 x i16> %3 -} - -define <8 x i16> @shuffle_test_permvar_hi_128(<8 x i16> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i16> [[TMP1]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - ret <8 x i16> %1 -} - -define <8 x i16> @shuffle_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i16> [[TMP3]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru - ret <8 x i16> %3 -} - -define <8 x i16> @undef_test_permvar_hi_128(<8 x i16> %a0) { -; -; CHECK-LABEL: 
@undef_test_permvar_hi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i16> [[TMP1]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - ret <8 x i16> %1 -} - -define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_hi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i16> [[TMP3]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru - ret <8 x i16> %3 -} - -declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>) - -define <16 x i16> @identity_test_permvar_hi_256(<16 x i16> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_hi_256( -; CHECK-NEXT: ret <16 x i16> [[A0:%.*]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - ret <16 x i16> %1 -} - -define <16 x i16> @identity_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_hi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i16> [[A0:%.*]], <16 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i16> [[TMP2]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru - ret <16 x i16> %3 -} - -define <16 x i16> @zero_test_permvar_hi_256(<16 x i16> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_hi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: ret <16 x i16> [[TMP1]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> zeroinitializer) - ret <16 x i16> %1 -} - -define <16 x i16> @zero_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_hi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i16> [[TMP3]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> zeroinitializer) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru - ret <16 x i16> %3 -} - -define <16 x i16> @shuffle_test_permvar_hi_256(<16 x i16> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i16> [[TMP1]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - ret <16 x i16> %1 -} - -define <16 x i16> @shuffle_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> 
poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i16> [[TMP3]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru - ret <16 x i16> %3 -} - -define <16 x i16> @undef_test_permvar_hi_256(<16 x i16> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_hi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i16> [[TMP1]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - ret <16 x i16> %1 -} - -define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_hi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i16> [[TMP3]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru - ret <16 x i16> %3 -} - -declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) - -define <32 x i16> @identity_test_permvar_hi_512(<32 x i16> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_hi_512( -; CHECK-NEXT: ret <32 x i16> [[A0:%.*]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - ret <32 x i16> %1 -} - -define <32 x i16> @identity_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_hi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i16> [[A0:%.*]], <32 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i16> [[TMP2]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru - ret <32 x i16> %3 -} - -define <32 x i16> @zero_test_permvar_hi_512(<32 x i16> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: ret <32 x i16> [[TMP1]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> zeroinitializer) - ret <32 x i16> %1 -} - -define <32 x i16> @zero_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_hi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i16> [[TMP3]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> zeroinitializer) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru - ret <32 x i16> %3 -} - -define <32 x i16> 
@shuffle_test_permvar_hi_512(<32 x i16> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: ret <32 x i16> [[TMP1]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - ret <32 x i16> %1 -} - -define <32 x i16> @shuffle_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i16> [[TMP3]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru - ret <32 x i16> %3 -} - -define <32 x i16> @undef_test_permvar_hi_512(<32 x i16> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: ret <32 x i16> [[TMP1]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - ret <32 x i16> %1 -} - -define <32 x i16> @undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_hi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i16> [[TMP3]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru - ret <32 x i16> %3 -} - -declare <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8>, <16 x i8>) - -define <16 x i8> @identity_test_permvar_qi_128(<16 x i8> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_qi_128( -; CHECK-NEXT: ret <16 x i8> [[A0:%.*]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - ret <16 x i8> %1 -} - -define <16 x i8> @identity_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_qi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A0:%.*]], <16 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i8> [[TMP2]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru - ret <16 x i8> %3 -} - -define <16 x i8> @zero_test_permvar_qi_128(<16 x i8> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_qi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: ret <16 x i8> [[TMP1]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> zeroinitializer) - ret <16 x i8> %1 -} - -define <16 x i8> @zero_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_qi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 
x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i8> [[TMP3]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> zeroinitializer) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru - ret <16 x i8> %3 -} - -define <16 x i8> @shuffle_test_permvar_qi_128(<16 x i8> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i8> [[TMP1]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - ret <16 x i8> %1 -} - -define <16 x i8> @shuffle_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i8> [[TMP3]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru - ret <16 x i8> %3 -} - -define <16 x i8> @undef_test_permvar_qi_128(<16 x i8> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_qi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i8> [[TMP1]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - ret <16 x i8> %1 -} - -define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_qi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i8> [[TMP3]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru - ret <16 x i8> %3 -} - -declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>) - -define <32 x i8> @identity_test_permvar_qi_256(<32 x i8> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_qi_256( -; CHECK-NEXT: ret <32 x i8> [[A0:%.*]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - ret <32 x i8> %1 -} - -define <32 x i8> @identity_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_qi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i8> [[A0:%.*]], <32 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i8> [[TMP2]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru - ret <32 x i8> %3 -} - -define <32 x i8> @zero_test_permvar_qi_256(<32 x i8> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_qi_256( 
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: ret <32 x i8> [[TMP1]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> zeroinitializer) - ret <32 x i8> %1 -} - -define <32 x i8> @zero_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_qi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i8> [[TMP3]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> zeroinitializer) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru - ret <32 x i8> %3 -} - -define <32 x i8> @shuffle_test_permvar_qi_256(<32 x i8> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: ret <32 x i8> [[TMP1]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - ret <32 x i8> %1 -} - -define <32 x i8> @shuffle_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i8> [[TMP3]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru - ret <32 x i8> %3 -} - -define <32 x i8> @undef_test_permvar_qi_256(<32 x i8> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_qi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: ret <32 x i8> [[TMP1]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - ret <32 x i8> %1 -} - -define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_qi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i8> [[TMP3]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru - ret <32 x i8> %3 -} - -declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) - -define <64 x i8> @identity_test_permvar_qi_512(<64 x i8> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_qi_512( -; CHECK-NEXT: ret <64 x i8> [[A0:%.*]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - ret <64 x i8> %1 -} - -define <64 x i8> @identity_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_qi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[MASK:%.*]] to 
<64 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <64 x i1> [[TMP1]], <64 x i8> [[A0:%.*]], <64 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <64 x i8> [[TMP2]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru - ret <64 x i8> %3 -} - -define <64 x i8> @zero_test_permvar_qi_512(<64 x i8> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer -; CHECK-NEXT: ret <64 x i8> [[TMP1]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> zeroinitializer) - ret <64 x i8> %1 -} - -define <64 x i8> @zero_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_qi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <64 x i8> [[TMP3]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> zeroinitializer) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru - ret <64 x i8> %3 -} - -define <64 x i8> @shuffle_test_permvar_qi_512(<64 x i8> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: ret <64 x i8> [[TMP1]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - ret <64 x i8> %1 -} - -define <64 x i8> @shuffle_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <64 x i8> [[TMP3]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru - ret <64 x i8> %3 -} - -define <64 x i8> @undef_test_permvar_qi_512(<64 x i8> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: ret <64 x i8> [[TMP1]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - ret <64 x i8> %1 -} - -define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_qi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <64 x i8> [[TMP3]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru - ret <64 x i8> %3 -} - declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32) 
define <16 x float> @test_add_ps(<16 x float> %a, <16 x float> %b) { diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll index 906e84b6074811..d89cf6b0bb9868 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll @@ -1814,1366 +1814,6 @@ define double @test_mask3_vfnmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double> ret double %13 } -declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) - -define <8 x i32> @identity_test_permvar_si_256(<8 x i32> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_si_256( -; CHECK-NEXT: ret <8 x i32> [[A0:%.*]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) - ret <8 x i32> %1 -} - -define <8 x i32> @identity_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_si_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> [[A0:%.*]], <8 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i32> [[TMP2]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru - ret <8 x i32> %3 -} - -define <8 x i32> @zero_test_permvar_si_256(<8 x i32> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_si_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: ret <8 x i32> [[TMP1]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> zeroinitializer) - ret <8 x i32> %1 -} - -define <8 x i32> @zero_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_si_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru - ret <8 x i32> %3 -} - -define <8 x i32> @shuffle_test_permvar_si_256(<8 x i32> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_si_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[TMP1]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) - ret <8 x i32> %1 -} - -define <8 x i32> @shuffle_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_si_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru - ret <8 x i32> %3 -} - -define <8 x i32> @undef_test_permvar_si_256(<8 x i32> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_si_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x 
i32> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[TMP1]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) - ret <8 x i32> %1 -} - -define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_si_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru - ret <8 x i32> %3 -} - -declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) - -define <8 x float> @identity_test_permvar_sf_256(<8 x float> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_sf_256( -; CHECK-NEXT: ret <8 x float> [[A0:%.*]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) - ret <8 x float> %1 -} - -define <8 x float> @identity_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_sf_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[A0:%.*]], <8 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x float> [[TMP2]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru - ret <8 x float> %3 -} - -define <8 x float> @zero_test_permvar_sf_256(<8 x float> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_sf_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: ret <8 x float> [[TMP1]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> zeroinitializer) - ret <8 x float> %1 -} - -define <8 x float> @zero_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_sf_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x float> [[TMP3]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru - ret <8 x float> %3 -} - -define <8 x float> @shuffle_test_permvar_sf_256(<8 x float> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_sf_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP1]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) - ret <8 x float> %1 -} - -define <8 x float> @shuffle_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_sf_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> 
[[TMP1]], <8 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x float> [[TMP3]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru - ret <8 x float> %3 -} - -define <8 x float> @undef_test_permvar_sf_256(<8 x float> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_sf_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP1]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) - ret <8 x float> %1 -} - -define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_sf_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x float> [[TMP3]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru - ret <8 x float> %3 -} - -declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>) - -define <4 x i64> @identity_test_permvar_di_256(<4 x i64> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_di_256( -; CHECK-NEXT: ret <4 x i64> [[A0:%.*]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - ret <4 x i64> %1 -} - -define <4 x i64> @identity_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_di_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[A0:%.*]], <4 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x i64> [[TMP2]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru - ret <4 x i64> %3 -} - -define <4 x i64> @zero_test_permvar_di_256(<4 x i64> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_di_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: ret <4 x i64> [[TMP1]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> zeroinitializer) - ret <4 x i64> %1 -} - -define <4 x i64> @zero_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_di_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, 
<4 x i64> %1, <4 x i64> %passthru - ret <4 x i64> %3 -} - -define <4 x i64> @shuffle_test_permvar_di_256(<4 x i64> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_di_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: ret <4 x i64> [[TMP1]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - ret <4 x i64> %1 -} - -define <4 x i64> @shuffle_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_di_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru - ret <4 x i64> %3 -} - -define <4 x i64> @undef_test_permvar_di_256(<4 x i64> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_di_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: ret <4 x i64> [[TMP1]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - ret <4 x i64> %1 -} - -define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_di_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru - ret <4 x i64> %3 -} - -declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>) - -define <4 x double> @identity_test_permvar_df_256(<4 x double> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_df_256( -; CHECK-NEXT: ret <4 x double> [[A0:%.*]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - ret <4 x double> %1 -} - -define <4 x double> @identity_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_df_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[A0:%.*]], <4 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x double> [[TMP2]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru - ret <4 x double> %3 -} - -define <4 x 
double> @zero_test_permvar_df_256(<4 x double> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_df_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: ret <4 x double> [[TMP1]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> zeroinitializer) - ret <4 x double> %1 -} - -define <4 x double> @zero_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_df_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x double> [[TMP3]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru - ret <4 x double> %3 -} - -define <4 x double> @shuffle_test_permvar_df_256(<4 x double> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_df_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: ret <4 x double> [[TMP1]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - ret <4 x double> %1 -} - -define <4 x double> @shuffle_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_df_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x double> [[TMP3]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru - ret <4 x double> %3 -} - -define <4 x double> @undef_test_permvar_df_256(<4 x double> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_df_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: ret <4 x double> [[TMP1]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - ret <4 x double> %1 -} - -define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_df_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x double> [[TMP3]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - 
%2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru - ret <4 x double> %3 -} - -declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) - -define <16 x i32> @identity_test_permvar_si_512(<16 x i32> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_si_512( -; CHECK-NEXT: ret <16 x i32> [[A0:%.*]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - ret <16 x i32> %1 -} - -define <16 x i32> @identity_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_si_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i32> [[A0:%.*]], <16 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i32> [[TMP2]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru - ret <16 x i32> %3 -} - -define <16 x i32> @zero_test_permvar_si_512(<16 x i32> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: ret <16 x i32> [[TMP1]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> zeroinitializer) - ret <16 x i32> %1 -} - -define <16 x i32> @zero_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_si_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> zeroinitializer) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru - ret <16 x i32> %3 -} - -define <16 x i32> @shuffle_test_permvar_si_512(<16 x i32> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i32> [[TMP1]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - ret <16 x i32> %1 -} - -define <16 x i32> @shuffle_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_si_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru - ret <16 x i32> %3 -} - -define <16 x i32> @undef_test_permvar_si_512(<16 x i32> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i32> [[TMP1]] -; - %1 = call <16 x i32> 
@llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - ret <16 x i32> %1 -} - -define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_si_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru - ret <16 x i32> %3 -} - -declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>) - -define <16 x float> @identity_test_permvar_sf_512(<16 x float> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_sf_512( -; CHECK-NEXT: ret <16 x float> [[A0:%.*]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - ret <16 x float> %1 -} - -define <16 x float> @identity_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_sf_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x float> [[A0:%.*]], <16 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x float> [[TMP2]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru - ret <16 x float> %3 -} - -define <16 x float> @zero_test_permvar_sf_512(<16 x float> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: ret <16 x float> [[TMP1]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> zeroinitializer) - ret <16 x float> %1 -} - -define <16 x float> @zero_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_sf_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x float> [[TMP3]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> zeroinitializer) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru - ret <16 x float> %3 -} - -define <16 x float> @shuffle_test_permvar_sf_512(<16 x float> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: ret <16 x float> [[TMP1]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - ret <16 x float> %1 -} - -define <16 x float> @shuffle_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_sf_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = 
bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x float> [[TMP3]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru - ret <16 x float> %3 -} - -define <16 x float> @undef_test_permvar_sf_512(<16 x float> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: ret <16 x float> [[TMP1]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - ret <16 x float> %1 -} - -define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_sf_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x float> [[TMP3]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru - ret <16 x float> %3 -} - -declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) - -define <8 x i64> @identity_test_permvar_di_512(<8 x i64> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_di_512( -; CHECK-NEXT: ret <8 x i64> [[A0:%.*]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - ret <8 x i64> %1 -} - -define <8 x i64> @identity_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_di_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> [[A0:%.*]], <8 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i64> [[TMP2]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru - ret <8 x i64> %3 -} - -define <8 x i64> @zero_test_permvar_di_512(<8 x i64> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: ret <8 x i64> [[TMP1]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> zeroinitializer) - ret <8 x i64> %1 -} - -define <8 x i64> @zero_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_di_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru - ret <8 x i64> %3 -} - -define <8 x i64> @shuffle_test_permvar_di_512(<8 x i64> %a0) { -; -; CHECK-LABEL: 
@shuffle_test_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i64> [[TMP1]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - ret <8 x i64> %1 -} - -define <8 x i64> @shuffle_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_di_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru - ret <8 x i64> %3 -} - -define <8 x i64> @undef_test_permvar_di_512(<8 x i64> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i64> [[TMP1]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - ret <8 x i64> %1 -} - -define <8 x i64> @undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_di_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru - ret <8 x i64> %3 -} - -declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>) - -define <8 x double> @identity_test_permvar_df_512(<8 x double> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_df_512( -; CHECK-NEXT: ret <8 x double> [[A0:%.*]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - ret <8 x double> %1 -} - -define <8 x double> @identity_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_df_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x double> [[A0:%.*]], <8 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x double> [[TMP2]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru - ret <8 x double> %3 -} - -define <8 x double> @zero_test_permvar_df_512(<8 x double> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: ret <8 x double> [[TMP1]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> zeroinitializer) - ret <8 x double> %1 -} - -define <8 x double> @zero_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_df_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x 
i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x double> [[TMP3]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru - ret <8 x double> %3 -} - -define <8 x double> @shuffle_test_permvar_df_512(<8 x double> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: ret <8 x double> [[TMP1]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - ret <8 x double> %1 -} - -define <8 x double> @shuffle_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_df_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x double> [[TMP3]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru - ret <8 x double> %3 -} - -define <8 x double> @undef_test_permvar_df_512(<8 x double> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: ret <8 x double> [[TMP1]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - ret <8 x double> %1 -} - -define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_df_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x double> [[TMP3]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru - ret <8 x double> %3 -} - -declare <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16>, <8 x i16>) - -define <8 x i16> @identity_test_permvar_hi_128(<8 x i16> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_hi_128( -; CHECK-NEXT: ret <8 x i16> [[A0:%.*]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - ret <8 x i16> %1 -} - -define <8 x i16> @identity_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_hi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A0:%.*]], <8 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i16> [[TMP2]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru - ret <8 x i16> %3 -} - -define <8 x i16> 
@zero_test_permvar_hi_128(<8 x i16> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_hi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: ret <8 x i16> [[TMP1]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> zeroinitializer) - ret <8 x i16> %1 -} - -define <8 x i16> @zero_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_hi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i16> [[TMP3]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru - ret <8 x i16> %3 -} - -define <8 x i16> @shuffle_test_permvar_hi_128(<8 x i16> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i16> [[TMP1]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - ret <8 x i16> %1 -} - -define <8 x i16> @shuffle_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i16> [[TMP3]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru - ret <8 x i16> %3 -} - -define <8 x i16> @undef_test_permvar_hi_128(<8 x i16> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_hi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i16> [[TMP1]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - ret <8 x i16> %1 -} - -define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_hi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i16> [[TMP3]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru - ret <8 x i16> %3 -} - -declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>) - -define <16 x i16> @identity_test_permvar_hi_256(<16 x i16> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_hi_256( -; CHECK-NEXT: ret <16 x i16> [[A0:%.*]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - ret <16 x i16> %1 -} - -define <16 x i16> @identity_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { -; -; CHECK-LABEL: 
@identity_test_permvar_hi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i16> [[A0:%.*]], <16 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i16> [[TMP2]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru - ret <16 x i16> %3 -} - -define <16 x i16> @zero_test_permvar_hi_256(<16 x i16> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_hi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: ret <16 x i16> [[TMP1]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> zeroinitializer) - ret <16 x i16> %1 -} - -define <16 x i16> @zero_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_hi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i16> [[TMP3]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> zeroinitializer) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru - ret <16 x i16> %3 -} - -define <16 x i16> @shuffle_test_permvar_hi_256(<16 x i16> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i16> [[TMP1]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - ret <16 x i16> %1 -} - -define <16 x i16> @shuffle_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i16> [[TMP3]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru - ret <16 x i16> %3 -} - -define <16 x i16> @undef_test_permvar_hi_256(<16 x i16> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_hi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i16> [[TMP1]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - ret <16 x i16> %1 -} - -define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_hi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i16> [[TMP3]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - %2 = bitcast i16 %mask to <16 x i1> - 
%3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru - ret <16 x i16> %3 -} - -declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) - -define <32 x i16> @identity_test_permvar_hi_512(<32 x i16> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_hi_512( -; CHECK-NEXT: ret <32 x i16> [[A0:%.*]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - ret <32 x i16> %1 -} - -define <32 x i16> @identity_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_hi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i16> [[A0:%.*]], <32 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i16> [[TMP2]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru - ret <32 x i16> %3 -} - -define <32 x i16> @zero_test_permvar_hi_512(<32 x i16> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: ret <32 x i16> [[TMP1]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> zeroinitializer) - ret <32 x i16> %1 -} - -define <32 x i16> @zero_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_hi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i16> [[TMP3]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> zeroinitializer) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru - ret <32 x i16> %3 -} - -define <32 x i16> @shuffle_test_permvar_hi_512(<32 x i16> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: ret <32 x i16> [[TMP1]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - ret <32 x i16> %1 -} - -define <32 x i16> @shuffle_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i16> [[TMP3]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru - ret <32 x i16> %3 -} - -define <32 x i16> @undef_test_permvar_hi_512(<32 x i16> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: ret <32 x i16> [[TMP1]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - ret <32 x i16> %1 -} - -define <32 x i16> 
@undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_hi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i16> [[TMP3]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru - ret <32 x i16> %3 -} - -declare <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8>, <16 x i8>) - -define <16 x i8> @identity_test_permvar_qi_128(<16 x i8> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_qi_128( -; CHECK-NEXT: ret <16 x i8> [[A0:%.*]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - ret <16 x i8> %1 -} - -define <16 x i8> @identity_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_qi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A0:%.*]], <16 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i8> [[TMP2]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru - ret <16 x i8> %3 -} - -define <16 x i8> @zero_test_permvar_qi_128(<16 x i8> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_qi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: ret <16 x i8> [[TMP1]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> zeroinitializer) - ret <16 x i8> %1 -} - -define <16 x i8> @zero_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_qi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i8> [[TMP3]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> zeroinitializer) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru - ret <16 x i8> %3 -} - -define <16 x i8> @shuffle_test_permvar_qi_128(<16 x i8> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i8> [[TMP1]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - ret <16 x i8> %1 -} - -define <16 x i8> @shuffle_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i8> [[TMP3]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - %2 = 
bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru - ret <16 x i8> %3 -} - -define <16 x i8> @undef_test_permvar_qi_128(<16 x i8> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_qi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i8> [[TMP1]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - ret <16 x i8> %1 -} - -define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_qi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i8> [[TMP3]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru - ret <16 x i8> %3 -} - -declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>) - -define <32 x i8> @identity_test_permvar_qi_256(<32 x i8> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_qi_256( -; CHECK-NEXT: ret <32 x i8> [[A0:%.*]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - ret <32 x i8> %1 -} - -define <32 x i8> @identity_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_qi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i8> [[A0:%.*]], <32 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i8> [[TMP2]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru - ret <32 x i8> %3 -} - -define <32 x i8> @zero_test_permvar_qi_256(<32 x i8> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_qi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: ret <32 x i8> [[TMP1]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> zeroinitializer) - ret <32 x i8> %1 -} - -define <32 x i8> @zero_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_qi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i8> [[TMP3]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> zeroinitializer) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru - ret <32 x i8> %3 -} - -define <32 x i8> @shuffle_test_permvar_qi_256(<32 x i8> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: ret <32 x i8> [[TMP1]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - ret <32 x i8> %1 -} - -define <32 x i8> @shuffle_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> 
%passthru, i32 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i8> [[TMP3]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru - ret <32 x i8> %3 -} - -define <32 x i8> @undef_test_permvar_qi_256(<32 x i8> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_qi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: ret <32 x i8> [[TMP1]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - ret <32 x i8> %1 -} - -define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_qi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i8> [[TMP3]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru - ret <32 x i8> %3 -} - -declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) - -define <64 x i8> @identity_test_permvar_qi_512(<64 x i8> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_qi_512( -; CHECK-NEXT: ret <64 x i8> [[A0:%.*]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - ret <64 x i8> %1 -} - -define <64 x i8> @identity_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_qi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <64 x i1> [[TMP1]], <64 x i8> [[A0:%.*]], <64 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <64 x i8> [[TMP2]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru - ret <64 x i8> %3 -} - -define <64 x i8> @zero_test_permvar_qi_512(<64 x i8> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer -; CHECK-NEXT: ret <64 x i8> [[TMP1]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> zeroinitializer) - ret <64 x i8> %1 -} - -define <64 x i8> @zero_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_qi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <64 x i8> [[TMP3]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> zeroinitializer) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x 
i8> %passthru - ret <64 x i8> %3 -} - -define <64 x i8> @shuffle_test_permvar_qi_512(<64 x i8> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: ret <64 x i8> [[TMP1]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - ret <64 x i8> %1 -} - -define <64 x i8> @shuffle_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <64 x i8> [[TMP3]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru - ret <64 x i8> %3 -} - -define <64 x i8> @undef_test_permvar_qi_512(<64 x i8> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: ret <64 x i8> [[TMP1]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - ret <64 x i8> %1 -} - -define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_qi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <64 x i8> [[TMP3]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru - ret <64 x i8> %3 -} - declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32) define <16 x float> @test_add_ps(<16 x float> %a, <16 x float> %b) { diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll b/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll new file mode 100644 index 00000000000000..a0e2d3d6fe9fbe --- /dev/null +++ b/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll @@ -0,0 +1,1362 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s + +declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) + +define <8 x i32> @identity_test_permvar_si_256(<8 x i32> %a0) { +; +; CHECK-LABEL: @identity_test_permvar_si_256( +; CHECK-NEXT: ret <8 x i32> [[A0:%.*]] +; + %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) + ret <8 x i32> %1 +} + +define <8 x i32> @identity_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { +; +; CHECK-LABEL: @identity_test_permvar_si_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> [[A0:%.*]], <8 x i32> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru + ret <8 x i32> %3 +} + +define <8 x i32> 
@zero_test_permvar_si_256(<8 x i32> %a0) { +; +; CHECK-LABEL: @zero_test_permvar_si_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; + %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> zeroinitializer) + ret <8 x i32> %1 +} + +define <8 x i32> @zero_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { +; +; CHECK-LABEL: @zero_test_permvar_si_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; + %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> zeroinitializer) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru + ret <8 x i32> %3 +} + +define <8 x i32> @shuffle_test_permvar_si_256(<8 x i32> %a0) { +; +; CHECK-LABEL: @shuffle_test_permvar_si_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; + %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) + ret <8 x i32> %1 +} + +define <8 x i32> @shuffle_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { +; +; CHECK-LABEL: @shuffle_test_permvar_si_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; + %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru + ret <8 x i32> %3 +} + +define <8 x i32> @undef_test_permvar_si_256(<8 x i32> %a0) { +; +; CHECK-LABEL: @undef_test_permvar_si_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; + %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) + ret <8 x i32> %1 +} + +define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { +; +; CHECK-LABEL: @undef_test_permvar_si_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; + %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru + ret <8 x i32> %3 +} + +declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) + +define <8 x float> @identity_test_permvar_sf_256(<8 x float> %a0) { +; +; CHECK-LABEL: @identity_test_permvar_sf_256( +; CHECK-NEXT: ret <8 x float> [[A0:%.*]] +; + %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) + ret <8 x float> %1 +} + +define <8 x float> @identity_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { +; +; CHECK-LABEL: @identity_test_permvar_sf_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x 
i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[A0:%.*]], <8 x float> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x float> [[TMP2]] +; + %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru + ret <8 x float> %3 +} + +define <8 x float> @zero_test_permvar_sf_256(<8 x float> %a0) { +; +; CHECK-LABEL: @zero_test_permvar_sf_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: ret <8 x float> [[TMP1]] +; + %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> zeroinitializer) + ret <8 x float> %1 +} + +define <8 x float> @zero_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { +; +; CHECK-LABEL: @zero_test_permvar_sf_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x float> [[TMP3]] +; + %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> zeroinitializer) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru + ret <8 x float> %3 +} + +define <8 x float> @shuffle_test_permvar_sf_256(<8 x float> %a0) { +; +; CHECK-LABEL: @shuffle_test_permvar_sf_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> +; CHECK-NEXT: ret <8 x float> [[TMP1]] +; + %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) + ret <8 x float> %1 +} + +define <8 x float> @shuffle_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { +; +; CHECK-LABEL: @shuffle_test_permvar_sf_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x float> [[TMP3]] +; + %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru + ret <8 x float> %3 +} + +define <8 x float> @undef_test_permvar_sf_256(<8 x float> %a0) { +; +; CHECK-LABEL: @undef_test_permvar_sf_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> +; CHECK-NEXT: ret <8 x float> [[TMP1]] +; + %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) + ret <8 x float> %1 +} + +define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { +; +; CHECK-LABEL: @undef_test_permvar_sf_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x float> [[TMP3]] +; + %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru + ret <8 x float> %3 +} + +declare <4 x i64> 
@llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>) + +define <4 x i64> @identity_test_permvar_di_256(<4 x i64> %a0) { +; +; CHECK-LABEL: @identity_test_permvar_di_256( +; CHECK-NEXT: ret <4 x i64> [[A0:%.*]] +; + %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) + ret <4 x i64> %1 +} + +define <4 x i64> @identity_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { +; +; CHECK-LABEL: @identity_test_permvar_di_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[A0:%.*]], <4 x i64> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <4 x i64> [[TMP2]] +; + %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) + %2 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru + ret <4 x i64> %3 +} + +define <4 x i64> @zero_test_permvar_di_256(<4 x i64> %a0) { +; +; CHECK-LABEL: @zero_test_permvar_di_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: ret <4 x i64> [[TMP1]] +; + %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> zeroinitializer) + ret <4 x i64> %1 +} + +define <4 x i64> @zero_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { +; +; CHECK-LABEL: @zero_test_permvar_di_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; + %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> zeroinitializer) + %2 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru + ret <4 x i64> %3 +} + +define <4 x i64> @shuffle_test_permvar_di_256(<4 x i64> %a0) { +; +; CHECK-LABEL: @shuffle_test_permvar_di_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> +; CHECK-NEXT: ret <4 x i64> [[TMP1]] +; + %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) + ret <4 x i64> %1 +} + +define <4 x i64> @shuffle_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { +; +; CHECK-LABEL: @shuffle_test_permvar_di_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; + %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) + %2 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru + ret <4 x i64> %3 +} + +define <4 x i64> @undef_test_permvar_di_256(<4 x i64> %a0) { +; +; CHECK-LABEL: 
@undef_test_permvar_di_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> +; CHECK-NEXT: ret <4 x i64> [[TMP1]] +; + %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) + ret <4 x i64> %1 +} + +define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { +; +; CHECK-LABEL: @undef_test_permvar_di_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; + %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) + %2 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru + ret <4 x i64> %3 +} + +declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>) + +define <4 x double> @identity_test_permvar_df_256(<4 x double> %a0) { +; +; CHECK-LABEL: @identity_test_permvar_df_256( +; CHECK-NEXT: ret <4 x double> [[A0:%.*]] +; + %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) + ret <4 x double> %1 +} + +define <4 x double> @identity_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { +; +; CHECK-LABEL: @identity_test_permvar_df_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[A0:%.*]], <4 x double> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <4 x double> [[TMP2]] +; + %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) + %2 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru + ret <4 x double> %3 +} + +define <4 x double> @zero_test_permvar_df_256(<4 x double> %a0) { +; +; CHECK-LABEL: @zero_test_permvar_df_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: ret <4 x double> [[TMP1]] +; + %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> zeroinitializer) + ret <4 x double> %1 +} + +define <4 x double> @zero_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { +; +; CHECK-LABEL: @zero_test_permvar_df_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <4 x double> [[TMP3]] +; + %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> zeroinitializer) + %2 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru + ret <4 x double> %3 +} + +define <4 x double> 
@shuffle_test_permvar_df_256(<4 x double> %a0) { +; +; CHECK-LABEL: @shuffle_test_permvar_df_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: ret <4 x double> [[TMP1]] +; + %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) + ret <4 x double> %1 +} + +define <4 x double> @shuffle_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { +; +; CHECK-LABEL: @shuffle_test_permvar_df_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <4 x double> [[TMP3]] +; + %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) + %2 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru + ret <4 x double> %3 +} + +define <4 x double> @undef_test_permvar_df_256(<4 x double> %a0) { +; +; CHECK-LABEL: @undef_test_permvar_df_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: ret <4 x double> [[TMP1]] +; + %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) + ret <4 x double> %1 +} + +define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { +; +; CHECK-LABEL: @undef_test_permvar_df_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <4 x double> [[TMP3]] +; + %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) + %2 = bitcast i8 %mask to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru + ret <4 x double> %3 +} + +declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) + +define <16 x i32> @identity_test_permvar_si_512(<16 x i32> %a0) { +; +; CHECK-LABEL: @identity_test_permvar_si_512( +; CHECK-NEXT: ret <16 x i32> [[A0:%.*]] +; + %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) + ret <16 x i32> %1 +} + +define <16 x i32> @identity_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { +; +; CHECK-LABEL: @identity_test_permvar_si_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i32> [[A0:%.*]], <16 x i32> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <16 x i32> [[TMP2]] +; + %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru + ret <16 x i32> %3 +} + +define <16 x i32> @zero_test_permvar_si_512(<16 x i32> %a0) { +; +; CHECK-LABEL: @zero_test_permvar_si_512( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> 
[[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; + %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> zeroinitializer) + ret <16 x i32> %1 +} + +define <16 x i32> @zero_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { +; +; CHECK-LABEL: @zero_test_permvar_si_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> zeroinitializer) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru + ret <16 x i32> %3 +} + +define <16 x i32> @shuffle_test_permvar_si_512(<16 x i32> %a0) { +; +; CHECK-LABEL: @shuffle_test_permvar_si_512( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; + %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) + ret <16 x i32> %1 +} + +define <16 x i32> @shuffle_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { +; +; CHECK-LABEL: @shuffle_test_permvar_si_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru + ret <16 x i32> %3 +} + +define <16 x i32> @undef_test_permvar_si_512(<16 x i32> %a0) { +; +; CHECK-LABEL: @undef_test_permvar_si_512( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; + %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) + ret <16 x i32> %1 +} + +define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { +; +; CHECK-LABEL: @undef_test_permvar_si_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru + ret <16 x i32> %3 +} + +declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>) + +define <16 x float> @identity_test_permvar_sf_512(<16 x float> %a0) { +; +; CHECK-LABEL: @identity_test_permvar_sf_512( +; CHECK-NEXT: ret <16 x float> [[A0:%.*]] +; + %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) + ret <16 x float> %1 +} + +define <16 x float> @identity_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { +; +; CHECK-LABEL: @identity_test_permvar_sf_512_mask( +; 
CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x float> [[A0:%.*]], <16 x float> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <16 x float> [[TMP2]] +; + %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru + ret <16 x float> %3 +} + +define <16 x float> @zero_test_permvar_sf_512(<16 x float> %a0) { +; +; CHECK-LABEL: @zero_test_permvar_sf_512( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: ret <16 x float> [[TMP1]] +; + %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> zeroinitializer) + ret <16 x float> %1 +} + +define <16 x float> @zero_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { +; +; CHECK-LABEL: @zero_test_permvar_sf_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <16 x float> [[TMP3]] +; + %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> zeroinitializer) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru + ret <16 x float> %3 +} + +define <16 x float> @shuffle_test_permvar_sf_512(<16 x float> %a0) { +; +; CHECK-LABEL: @shuffle_test_permvar_sf_512( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> +; CHECK-NEXT: ret <16 x float> [[TMP1]] +; + %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) + ret <16 x float> %1 +} + +define <16 x float> @shuffle_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { +; +; CHECK-LABEL: @shuffle_test_permvar_sf_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <16 x float> [[TMP3]] +; + %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru + ret <16 x float> %3 +} + +define <16 x float> @undef_test_permvar_sf_512(<16 x float> %a0) { +; +; CHECK-LABEL: @undef_test_permvar_sf_512( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> +; CHECK-NEXT: ret <16 x float> [[TMP1]] +; + %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) + ret <16 x float> %1 +} + +define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { +; +; CHECK-LABEL: @undef_test_permvar_sf_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <16 x float> [[TMP3]] +; + %1 = call <16 x float> 
@llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru + ret <16 x float> %3 +} + +declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) + +define <8 x i64> @identity_test_permvar_di_512(<8 x i64> %a0) { +; +; CHECK-LABEL: @identity_test_permvar_di_512( +; CHECK-NEXT: ret <8 x i64> [[A0:%.*]] +; + %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) + ret <8 x i64> %1 +} + +define <8 x i64> @identity_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { +; +; CHECK-LABEL: @identity_test_permvar_di_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> [[A0:%.*]], <8 x i64> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x i64> [[TMP2]] +; + %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru + ret <8 x i64> %3 +} + +define <8 x i64> @zero_test_permvar_di_512(<8 x i64> %a0) { +; +; CHECK-LABEL: @zero_test_permvar_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; + %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> zeroinitializer) + ret <8 x i64> %1 +} + +define <8 x i64> @zero_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { +; +; CHECK-LABEL: @zero_test_permvar_di_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x i64> [[TMP3]] +; + %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> zeroinitializer) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru + ret <8 x i64> %3 +} + +define <8 x i64> @shuffle_test_permvar_di_512(<8 x i64> %a0) { +; +; CHECK-LABEL: @shuffle_test_permvar_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; + %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) + ret <8 x i64> %1 +} + +define <8 x i64> @shuffle_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { +; +; CHECK-LABEL: @shuffle_test_permvar_di_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x i64> [[TMP3]] +; + %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru + ret <8 x i64> %3 +} + +define <8 x i64> @undef_test_permvar_di_512(<8 x i64> %a0) { +; +; CHECK-LABEL: @undef_test_permvar_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; + %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) + ret <8 x i64> %1 +} + +define <8 x i64> 
@undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { +; +; CHECK-LABEL: @undef_test_permvar_di_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x i64> [[TMP3]] +; + %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru + ret <8 x i64> %3 +} + +declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>) + +define <8 x double> @identity_test_permvar_df_512(<8 x double> %a0) { +; +; CHECK-LABEL: @identity_test_permvar_df_512( +; CHECK-NEXT: ret <8 x double> [[A0:%.*]] +; + %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) + ret <8 x double> %1 +} + +define <8 x double> @identity_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { +; +; CHECK-LABEL: @identity_test_permvar_df_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x double> [[A0:%.*]], <8 x double> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x double> [[TMP2]] +; + %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru + ret <8 x double> %3 +} + +define <8 x double> @zero_test_permvar_df_512(<8 x double> %a0) { +; +; CHECK-LABEL: @zero_test_permvar_df_512( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: ret <8 x double> [[TMP1]] +; + %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> zeroinitializer) + ret <8 x double> %1 +} + +define <8 x double> @zero_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { +; +; CHECK-LABEL: @zero_test_permvar_df_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x double> [[TMP3]] +; + %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> zeroinitializer) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru + ret <8 x double> %3 +} + +define <8 x double> @shuffle_test_permvar_df_512(<8 x double> %a0) { +; +; CHECK-LABEL: @shuffle_test_permvar_df_512( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> +; CHECK-NEXT: ret <8 x double> [[TMP1]] +; + %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) + ret <8 x double> %1 +} + +define <8 x double> @shuffle_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { +; +; CHECK-LABEL: @shuffle_test_permvar_df_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]] +; 
CHECK-NEXT: ret <8 x double> [[TMP3]] +; + %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru + ret <8 x double> %3 +} + +define <8 x double> @undef_test_permvar_df_512(<8 x double> %a0) { +; +; CHECK-LABEL: @undef_test_permvar_df_512( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> +; CHECK-NEXT: ret <8 x double> [[TMP1]] +; + %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) + ret <8 x double> %1 +} + +define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { +; +; CHECK-LABEL: @undef_test_permvar_df_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x double> [[TMP3]] +; + %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru + ret <8 x double> %3 +} + +declare <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16>, <8 x i16>) + +define <8 x i16> @identity_test_permvar_hi_128(<8 x i16> %a0) { +; +; CHECK-LABEL: @identity_test_permvar_hi_128( +; CHECK-NEXT: ret <8 x i16> [[A0:%.*]] +; + %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) + ret <8 x i16> %1 +} + +define <8 x i16> @identity_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { +; +; CHECK-LABEL: @identity_test_permvar_hi_128_mask( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A0:%.*]], <8 x i16> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x i16> [[TMP2]] +; + %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru + ret <8 x i16> %3 +} + +define <8 x i16> @zero_test_permvar_hi_128(<8 x i16> %a0) { +; +; CHECK-LABEL: @zero_test_permvar_hi_128( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: ret <8 x i16> [[TMP1]] +; + %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> zeroinitializer) + ret <8 x i16> %1 +} + +define <8 x i16> @zero_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { +; +; CHECK-LABEL: @zero_test_permvar_hi_128_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x i16> [[TMP3]] +; + %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> zeroinitializer) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru + ret <8 x i16> %3 +} + +define <8 x i16> @shuffle_test_permvar_hi_128(<8 x i16> %a0) { +; +; CHECK-LABEL: @shuffle_test_permvar_hi_128( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i16> [[TMP1]] +; + %1 = 
call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) + ret <8 x i16> %1 +} + +define <8 x i16> @shuffle_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { +; +; CHECK-LABEL: @shuffle_test_permvar_hi_128_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x i16> [[TMP3]] +; + %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru + ret <8 x i16> %3 +} + +define <8 x i16> @undef_test_permvar_hi_128(<8 x i16> %a0) { +; +; CHECK-LABEL: @undef_test_permvar_hi_128( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i16> [[TMP1]] +; + %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) + ret <8 x i16> %1 +} + +define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { +; +; CHECK-LABEL: @undef_test_permvar_hi_128_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x i16> [[TMP3]] +; + %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru + ret <8 x i16> %3 +} + +declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>) + +define <16 x i16> @identity_test_permvar_hi_256(<16 x i16> %a0) { +; +; CHECK-LABEL: @identity_test_permvar_hi_256( +; CHECK-NEXT: ret <16 x i16> [[A0:%.*]] +; + %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) + ret <16 x i16> %1 +} + +define <16 x i16> @identity_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { +; +; CHECK-LABEL: @identity_test_permvar_hi_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i16> [[A0:%.*]], <16 x i16> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <16 x i16> [[TMP2]] +; + %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru + ret <16 x i16> %3 +} + +define <16 x i16> @zero_test_permvar_hi_256(<16 x i16> %a0) { +; +; CHECK-LABEL: @zero_test_permvar_hi_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: ret <16 x i16> [[TMP1]] +; + %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> zeroinitializer) + ret <16 x i16> %1 +} + +define <16 x i16> @zero_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { +; +; CHECK-LABEL: @zero_test_permvar_hi_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <16 x 
i16> [[TMP3]] +; + %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> zeroinitializer) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru + ret <16 x i16> %3 +} + +define <16 x i16> @shuffle_test_permvar_hi_256(<16 x i16> %a0) { +; +; CHECK-LABEL: @shuffle_test_permvar_hi_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i16> [[TMP1]] +; + %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) + ret <16 x i16> %1 +} + +define <16 x i16> @shuffle_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { +; +; CHECK-LABEL: @shuffle_test_permvar_hi_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <16 x i16> [[TMP3]] +; + %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru + ret <16 x i16> %3 +} + +define <16 x i16> @undef_test_permvar_hi_256(<16 x i16> %a0) { +; +; CHECK-LABEL: @undef_test_permvar_hi_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i16> [[TMP1]] +; + %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) + ret <16 x i16> %1 +} + +define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { +; +; CHECK-LABEL: @undef_test_permvar_hi_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <16 x i16> [[TMP3]] +; + %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru + ret <16 x i16> %3 +} + +declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) + +define <32 x i16> @identity_test_permvar_hi_512(<32 x i16> %a0) { +; +; CHECK-LABEL: @identity_test_permvar_hi_512( +; CHECK-NEXT: ret <32 x i16> [[A0:%.*]] +; + %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) + ret <32 x i16> %1 +} + +define <32 x i16> @identity_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { +; +; CHECK-LABEL: @identity_test_permvar_hi_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i16> [[A0:%.*]], <32 x i16> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <32 x i16> [[TMP2]] +; + %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru + ret <32 x i16> %3 +} + +define <32 x i16> @zero_test_permvar_hi_512(<32 x i16> %a0) { +; +; CHECK-LABEL: @zero_test_permvar_hi_512( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer +; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; + %1 = call <32 x i16> 
@llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> zeroinitializer) + ret <32 x i16> %1 +} + +define <32 x i16> @zero_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { +; +; CHECK-LABEL: @zero_test_permvar_hi_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <32 x i16> [[TMP3]] +; + %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> zeroinitializer) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru + ret <32 x i16> %3 +} + +define <32 x i16> @shuffle_test_permvar_hi_512(<32 x i16> %a0) { +; +; CHECK-LABEL: @shuffle_test_permvar_hi_512( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> +; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; + %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) + ret <32 x i16> %1 +} + +define <32 x i16> @shuffle_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { +; +; CHECK-LABEL: @shuffle_test_permvar_hi_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <32 x i16> [[TMP3]] +; + %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru + ret <32 x i16> %3 +} + +define <32 x i16> @undef_test_permvar_hi_512(<32 x i16> %a0) { +; +; CHECK-LABEL: @undef_test_permvar_hi_512( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> +; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; + %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) + ret <32 x i16> %1 +} + +define <32 x i16> @undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { +; +; CHECK-LABEL: @undef_test_permvar_hi_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <32 x i16> [[TMP3]] +; + %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru + ret <32 x i16> %3 +} + +declare <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8>, <16 x i8>) + +define <16 x i8> @identity_test_permvar_qi_128(<16 x i8> %a0) { +; +; CHECK-LABEL: @identity_test_permvar_qi_128( +; CHECK-NEXT: ret <16 x i8> [[A0:%.*]] +; + %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) + ret <16 x i8> %1 +} + +define <16 x i8> @identity_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { +; +; CHECK-LABEL: @identity_test_permvar_qi_128_mask( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A0:%.*]], <16 x i8> 
[[PASSTHRU:%.*]] +; CHECK-NEXT: ret <16 x i8> [[TMP2]] +; + %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru + ret <16 x i8> %3 +} + +define <16 x i8> @zero_test_permvar_qi_128(<16 x i8> %a0) { +; +; CHECK-LABEL: @zero_test_permvar_qi_128( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> zeroinitializer) + ret <16 x i8> %1 +} + +define <16 x i8> @zero_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { +; +; CHECK-LABEL: @zero_test_permvar_qi_128_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <16 x i8> [[TMP3]] +; + %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> zeroinitializer) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru + ret <16 x i8> %3 +} + +define <16 x i8> @shuffle_test_permvar_qi_128(<16 x i8> %a0) { +; +; CHECK-LABEL: @shuffle_test_permvar_qi_128( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) + ret <16 x i8> %1 +} + +define <16 x i8> @shuffle_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { +; +; CHECK-LABEL: @shuffle_test_permvar_qi_128_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <16 x i8> [[TMP3]] +; + %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru + ret <16 x i8> %3 +} + +define <16 x i8> @undef_test_permvar_qi_128(<16 x i8> %a0) { +; +; CHECK-LABEL: @undef_test_permvar_qi_128( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) + ret <16 x i8> %1 +} + +define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { +; +; CHECK-LABEL: @undef_test_permvar_qi_128_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <16 x i8> [[TMP3]] +; + %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru + ret <16 x i8> %3 +} + +declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>) + +define <32 x i8> @identity_test_permvar_qi_256(<32 x i8> %a0) { +; +; CHECK-LABEL: 
@identity_test_permvar_qi_256( +; CHECK-NEXT: ret <32 x i8> [[A0:%.*]] +; + %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) + ret <32 x i8> %1 +} + +define <32 x i8> @identity_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { +; +; CHECK-LABEL: @identity_test_permvar_qi_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i8> [[A0:%.*]], <32 x i8> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <32 x i8> [[TMP2]] +; + %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru + ret <32 x i8> %3 +} + +define <32 x i8> @zero_test_permvar_qi_256(<32 x i8> %a0) { +; +; CHECK-LABEL: @zero_test_permvar_qi_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer +; CHECK-NEXT: ret <32 x i8> [[TMP1]] +; + %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> zeroinitializer) + ret <32 x i8> %1 +} + +define <32 x i8> @zero_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { +; +; CHECK-LABEL: @zero_test_permvar_qi_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <32 x i8> [[TMP3]] +; + %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> zeroinitializer) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru + ret <32 x i8> %3 +} + +define <32 x i8> @shuffle_test_permvar_qi_256(<32 x i8> %a0) { +; +; CHECK-LABEL: @shuffle_test_permvar_qi_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: ret <32 x i8> [[TMP1]] +; + %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) + ret <32 x i8> %1 +} + +define <32 x i8> @shuffle_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { +; +; CHECK-LABEL: @shuffle_test_permvar_qi_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <32 x i8> [[TMP3]] +; + %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru + ret <32 x i8> %3 +} + +define <32 x i8> @undef_test_permvar_qi_256(<32 x i8> %a0) { +; +; CHECK-LABEL: @undef_test_permvar_qi_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: ret <32 x i8> [[TMP1]] +; + %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) + ret <32 x i8> %1 +} + +define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { +; +; CHECK-LABEL: @undef_test_permvar_qi_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <32 
x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <32 x i8> [[TMP3]] +; + %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru + ret <32 x i8> %3 +} + +declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) + +define <64 x i8> @identity_test_permvar_qi_512(<64 x i8> %a0) { +; +; CHECK-LABEL: @identity_test_permvar_qi_512( +; CHECK-NEXT: ret <64 x i8> [[A0:%.*]] +; + %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) + ret <64 x i8> %1 +} + +define <64 x i8> @identity_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { +; +; CHECK-LABEL: @identity_test_permvar_qi_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <64 x i1> [[TMP1]], <64 x i8> [[A0:%.*]], <64 x i8> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <64 x i8> [[TMP2]] +; + %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) + %2 = bitcast i64 %mask to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru + ret <64 x i8> %3 +} + +define <64 x i8> @zero_test_permvar_qi_512(<64 x i8> %a0) { +; +; CHECK-LABEL: @zero_test_permvar_qi_512( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer +; CHECK-NEXT: ret <64 x i8> [[TMP1]] +; + %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> zeroinitializer) + ret <64 x i8> %1 +} + +define <64 x i8> @zero_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { +; +; CHECK-LABEL: @zero_test_permvar_qi_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <64 x i8> [[TMP3]] +; + %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> zeroinitializer) + %2 = bitcast i64 %mask to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru + ret <64 x i8> %3 +} + +define <64 x i8> @shuffle_test_permvar_qi_512(<64 x i8> %a0) { +; +; CHECK-LABEL: @shuffle_test_permvar_qi_512( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: ret <64 x i8> [[TMP1]] +; + %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) + ret <64 x i8> %1 +} + +define <64 x i8> @shuffle_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { +; +; CHECK-LABEL: @shuffle_test_permvar_qi_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <64 x i8> [[TMP3]] +; + %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) + %2 = bitcast i64 %mask to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru + ret <64 x i8> %3 +} + +define <64 x i8> @undef_test_permvar_qi_512(<64 x i8> %a0) { +; +; CHECK-LABEL: @undef_test_permvar_qi_512( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: ret <64 x i8> 
[[TMP1]] +; + %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) + ret <64 x i8> %1 +} + +define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { +; +; CHECK-LABEL: @undef_test_permvar_qi_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <64 x i8> [[TMP3]] +; + %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) + %2 = bitcast i64 %mask to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru + ret <64 x i8> %3 +} From 6345604ae51df1251de5b5fd442910f4d8f5023e Mon Sep 17 00:00:00 2001 From: Danial Klimkin Date: Fri, 30 Aug 2024 13:39:30 +0200 Subject: [PATCH 17/98] Revert: [AMDGPU] Graph-based Module Splitting Rewrite (llvm#104763) (#106707) * Revert "Fix MSVC "not all control paths return a value" warning. NFC." Dep to revert c9b6e01b2e4fc930dac91dd44c0592ad7e36d967 * Revert "[AMDGPU] Graph-based Module Splitting Rewrite (#104763)" Breaks tests. --- llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp | 1803 +++++------------ .../address-taken-externalize-with-call.ll | 36 +- .../AMDGPU/address-taken-externalize.ll | 2 +- .../llvm-split/AMDGPU/debug-name-hiding.ll | 20 + .../AMDGPU/debug-non-kernel-root.ll | 36 + .../tools/llvm-split/AMDGPU/declarations.ll | 9 +- .../AMDGPU/kernels-alias-dependencies.ll | 18 +- .../llvm-split/AMDGPU/kernels-cost-ranking.ll | 12 +- .../AMDGPU/kernels-dependency-external.ll | 33 +- .../AMDGPU/kernels-dependency-indirect.ll | 30 +- .../AMDGPU/kernels-dependency-overridable.ll | 28 +- .../kernels-global-variables-noexternal.ll | 12 +- .../AMDGPU/kernels-global-variables.ll | 12 +- .../AMDGPU/large-kernels-merging.ll | 26 +- .../AMDGPU/non-kernels-dependency-indirect.ll | 30 +- .../llvm-split/AMDGPU/recursive-search-2.ll | 128 -- .../llvm-split/AMDGPU/recursive-search-8.ll | 128 -- 17 files changed, 738 insertions(+), 1625 deletions(-) create mode 100644 llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll create mode 100644 llvm/test/tools/llvm-split/AMDGPU/debug-non-kernel-root.ll delete mode 100644 llvm/test/tools/llvm-split/AMDGPU/recursive-search-2.ll delete mode 100644 llvm/test/tools/llvm-split/AMDGPU/recursive-search-8.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp index a5807a70582b39..df084cf41c4783 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp @@ -7,36 +7,33 @@ //===----------------------------------------------------------------------===// // /// \file Implements a module splitting algorithm designed to support the -/// FullLTO --lto-partitions option for parallel codegen. +/// FullLTO --lto-partitions option for parallel codegen. This is completely +/// different from the common SplitModule pass, as this system is designed with +/// AMDGPU in mind. /// -/// The role of this module splitting pass is the same as -/// lib/Transforms/Utils/SplitModule.cpp: load-balance the module's functions -/// across a set of N partitions to allow for parallel codegen. +/// The basic idea of this module splitting implementation is the same as +/// SplitModule: load-balance the module's functions across a set of N +/// partitions to allow parallel codegen. 
However, it does it very +/// differently than the target-agnostic variant: +/// - The module has "split roots", which are kernels in the vast +// majority of cases. +/// - Each root has a set of dependencies, and when a root and its +/// dependencies is considered "big", we try to put it in a partition where +/// most dependencies are already imported, to avoid duplicating large +/// amounts of code. +/// - There's special care for indirect calls in order to ensure +/// AMDGPUResourceUsageAnalysis can work correctly. /// -/// The similarities mostly end here, as this pass achieves load-balancing in a -/// more elaborate fashion which is targeted towards AMDGPU modules. It can take -/// advantage of the structure of AMDGPU modules (which are mostly -/// self-contained) to allow for more efficient splitting without affecting -/// codegen negatively, or causing innaccurate resource usage analysis. -/// -/// High-level pass overview: -/// - SplitGraph & associated classes -/// - Graph representation of the module and of the dependencies that -/// matter for splitting. -/// - RecursiveSearchSplitting -/// - Core splitting algorithm. -/// - SplitProposal -/// - Represents a suggested solution for splitting the input module. These -/// solutions can be scored to determine the best one when multiple -/// solutions are available. -/// - Driver/pass "run" function glues everything together. +/// This file also includes a more elaborate logging system to enable +/// users to easily generate logs that (if desired) do not include any value +/// names, in order to not leak information about the source file. +/// Such logs are very helpful to understand and fix potential issues with +/// module splitting. #include "AMDGPUSplitModule.h" #include "AMDGPUTargetMachine.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/EquivalenceClasses.h" -#include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" @@ -47,56 +44,44 @@ #include "llvm/IR/Module.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" -#include "llvm/Support/Allocator.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/DOTGraphTraits.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" -#include "llvm/Support/GraphWriter.h" #include "llvm/Support/Path.h" -#include "llvm/Support/Timer.h" +#include "llvm/Support/Process.h" +#include "llvm/Support/SHA256.h" +#include "llvm/Support/Threading.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Cloning.h" #include #include -#include #include #include #include #include -#ifndef NDEBUG -#include "llvm/Support/LockFileManager.h" -#endif +using namespace llvm; #define DEBUG_TYPE "amdgpu-split-module" -namespace llvm { namespace { -static cl::opt MaxDepth( - "amdgpu-module-splitting-max-depth", - cl::desc( - "maximum search depth. 0 forces a greedy approach. " - "warning: the algorithm is up to O(2^N), where N is the max depth."), - cl::init(8)); - static cl::opt LargeFnFactor( - "amdgpu-module-splitting-large-threshold", cl::init(2.0f), cl::Hidden, + "amdgpu-module-splitting-large-function-threshold", cl::init(2.0f), + cl::Hidden, cl::desc( - "when max depth is reached and we can no longer branch out, this " - "value determines if a function is worth merging into an already " - "existing partition to reduce code duplication. This is a factor " - "of the ideal partition size, e.g. 
2.0 means we consider the " - "function for merging if its cost (including its callees) is 2x the " - "size of an ideal partition.")); + "consider a function as large and needing special treatment when the " + "cost of importing it into a partition" + "exceeds the average cost of a partition by this factor; e;g. 2.0 " + "means if the function and its dependencies is 2 times bigger than " + "an average partition; 0 disables large functions handling entirely")); static cl::opt LargeFnOverlapForMerge( - "amdgpu-module-splitting-merge-threshold", cl::init(0.7f), cl::Hidden, - cl::desc("when a function is considered for merging into a partition that " - "already contains some of its callees, do the merge if at least " - "n% of the code it can reach is already present inside the " - "partition; e.g. 0.7 means only merge >70%")); + "amdgpu-module-splitting-large-function-merge-overlap", cl::init(0.8f), + cl::Hidden, + cl::desc( + "defines how much overlap between two large function's dependencies " + "is needed to put them in the same partition")); static cl::opt NoExternalizeGlobals( "amdgpu-module-splitting-no-externalize-globals", cl::Hidden, @@ -104,92 +89,142 @@ static cl::opt NoExternalizeGlobals( "may cause globals to be duplicated which increases binary size")); static cl::opt - ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg", - cl::Hidden, - cl::desc("output file to write out the dotgraph " - "representation of the input module")); + LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden, + cl::desc("output directory for AMDGPU module splitting logs")); -static cl::opt PartitionSummariesOutput( - "amdgpu-module-splitting-print-partition-summaries", cl::Hidden, - cl::desc("output file to write out a summary of " - "the partitions created for each module")); - -#ifndef NDEBUG static cl::opt - UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden, - cl::desc("use a lock file so only one process in the system " - "can run this pass at once. 
useful to avoid mangled " - "debug output in multithreaded environments.")); + LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden, + cl::desc("hash value names before printing them in the AMDGPU " + "module splitting logs")); -static cl::opt - DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search", - cl::Hidden, - cl::desc("print all proposals received and whether " - "they were rejected or accepted")); -#endif +using CostType = InstructionCost::CostType; +using PartitionID = unsigned; +using GetTTIFn = function_ref; -struct SplitModuleTimer : NamedRegionTimer { - SplitModuleTimer(StringRef Name, StringRef Desc) - : NamedRegionTimer(Name, Desc, DEBUG_TYPE, "AMDGPU Module Splitting", - TimePassesIsEnabled) {} -}; +static bool isEntryPoint(const Function *F) { + return AMDGPU::isEntryFunctionCC(F->getCallingConv()); +} -//===----------------------------------------------------------------------===// -// Utils -//===----------------------------------------------------------------------===// +static std::string getName(const Value &V) { + static bool HideNames; -using CostType = InstructionCost::CostType; -using FunctionsCostMap = DenseMap; -using GetTTIFn = function_ref; -static constexpr unsigned InvalidPID = -1; + static llvm::once_flag HideNameInitFlag; + llvm::call_once(HideNameInitFlag, [&]() { + if (LogPrivate.getNumOccurrences()) + HideNames = LogPrivate; + else { + const auto EV = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE"); + HideNames = (EV.value_or("0") != "0"); + } + }); -/// \param Num numerator -/// \param Dem denominator -/// \returns a printable object to print (Num/Dem) using "%0.2f". -static auto formatRatioOf(CostType Num, CostType Dem) { - return format("%0.2f", (static_cast(Num) / Dem) * 100); + if (!HideNames) + return V.getName().str(); + return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())), + /*LowerCase=*/true); } -/// Checks whether a given function is non-copyable. +/// Main logging helper. /// -/// Non-copyable functions cannot be cloned into multiple partitions, and only -/// one copy of the function can be present across all partitions. +/// Logging can be configured by the following environment variable. +/// AMD_SPLIT_MODULE_LOG_DIR= +/// If set, uses as the directory to write logfiles to +/// each time module splitting is used. +/// AMD_SPLIT_MODULE_LOG_PRIVATE +/// If set to anything other than zero, all names are hidden. /// -/// External functions fall into this category. If we were to clone them, we -/// would end up with multiple symbol definitions and a very unhappy linker. -static bool isNonCopyable(const Function &F) { - assert(AMDGPU::isEntryFunctionCC(F.getCallingConv()) - ? F.hasExternalLinkage() - : true && "Kernel w/o external linkage?"); - return F.hasExternalLinkage() || !F.isDefinitionExact(); -} +/// Both environment variables have corresponding CL options which +/// takes priority over them. +/// +/// Any output printed to the log files is also printed to dbgs() when -debug is +/// used and LLVM_DEBUG is defined. +/// +/// This approach has a small disadvantage over LLVM_DEBUG though: logging logic +/// cannot be removed from the code (by building without debug). This probably +/// has a small performance cost because if some computation/formatting is +/// needed for logging purpose, it may be done everytime only to be ignored +/// by the logger. +/// +/// As this pass only runs once and is not doing anything computationally +/// expensive, this is likely a reasonable trade-off. 
+/// +/// If some computation should really be avoided when unused, users of the class +/// can check whether any logging will occur by using the bool operator. +/// +/// \code +/// if (SML) { +/// // Executes only if logging to a file or if -debug is available and +/// used. +/// } +/// \endcode +class SplitModuleLogger { +public: + SplitModuleLogger(const Module &M) { + std::string LogDir = LogDirOpt; + if (LogDir.empty()) + LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or(""); + + // No log dir specified means we don't need to log to a file. + // We may still log to dbgs(), though. + if (LogDir.empty()) + return; + + // If a log directory is specified, create a new file with a unique name in + // that directory. + int Fd; + SmallString<0> PathTemplate; + SmallString<0> RealPath; + sys::path::append(PathTemplate, LogDir, "Module-%%-%%-%%-%%-%%-%%-%%.txt"); + if (auto Err = + sys::fs::createUniqueFile(PathTemplate.str(), Fd, RealPath)) { + report_fatal_error("Failed to create log file at '" + Twine(LogDir) + + "': " + Err.message(), + /*CrashDiag=*/false); + } -/// If \p GV has local linkage, make it external + hidden. -static void externalize(GlobalValue &GV) { - if (GV.hasLocalLinkage()) { - GV.setLinkage(GlobalValue::ExternalLinkage); - GV.setVisibility(GlobalValue::HiddenVisibility); + FileOS = std::make_unique(Fd, /*shouldClose=*/true); } - // Unnamed entities must be named consistently between modules. setName will - // give a distinct name to each such entity. - if (!GV.hasName()) - GV.setName("__llvmsplit_unnamed"); + bool hasLogFile() const { return FileOS != nullptr; } + + raw_ostream &logfile() { + assert(FileOS && "no logfile!"); + return *FileOS; + } + + /// \returns true if this SML will log anything either to a file or dbgs(). + /// Can be used to avoid expensive computations that are ignored when logging + /// is disabled. + operator bool() const { + return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE)); + } + +private: + std::unique_ptr FileOS; +}; + +template +static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) { + static_assert( + !std::is_same_v, + "do not print values to logs directly, use handleName instead!"); + LLVM_DEBUG(dbgs() << Val); + if (SML.hasLogFile()) + SML.logfile() << Val; + return SML; } -/// Cost analysis function. Calculates the cost of each function in \p M -/// +/// Calculate the cost of each function in \p M +/// \param SML Log Helper /// \param GetTTI Abstract getter for TargetTransformInfo. /// \param M Module to analyze. /// \param CostMap[out] Resulting Function -> Cost map. /// \return The module's total cost. 
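[Editor's note] As a rough sketch of what such a cost walk can look like, assuming the usual TargetTransformInfo interface; the cost kind and the fallback value of 1 here are illustrative choices, not lifted verbatim from the pass:

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/Function.h"
    using namespace llvm;

    static InstructionCost::CostType costOf(const Function &Fn,
                                            const TargetTransformInfo &TTI) {
      InstructionCost::CostType Cost = 0;
      for (const BasicBlock &BB : Fn)
        for (const Instruction &I : BB) {
          InstructionCost IC =
              TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
          // Fall back to 1 for unknown costs so a single exotic instruction
          // cannot invalidate the whole sum.
          Cost += IC.isValid() ? *IC.getValue() : 1;
        }
      return Cost;
    }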
-static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M, - FunctionsCostMap &CostMap) { - SplitModuleTimer SMT("calculateFunctionCosts", "cost analysis"); - - LLVM_DEBUG(dbgs() << "[cost analysis] calculating function costs\n"); +static CostType +calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M, + DenseMap &CostMap) { CostType ModuleCost = 0; - [[maybe_unused]] CostType KernelCost = 0; + CostType KernelCost = 0; for (auto &Fn : M) { if (Fn.isDeclaration()) @@ -216,30 +251,23 @@ static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M, assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!"); ModuleCost += FnCost; - if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) + if (isEntryPoint(&Fn)) KernelCost += FnCost; } - if (CostMap.empty()) - return 0; - - assert(ModuleCost); - LLVM_DEBUG({ - const CostType FnCost = ModuleCost - KernelCost; - dbgs() << " - total module cost is " << ModuleCost << ". kernels cost " - << "" << KernelCost << " (" - << format("%0.2f", (float(KernelCost) / ModuleCost) * 100) - << "% of the module), functions cost " << FnCost << " (" - << format("%0.2f", (float(FnCost) / ModuleCost) * 100) - << "% of the module)\n"; - }); + CostType FnCost = (ModuleCost - KernelCost); + CostType ModuleCostOr1 = ModuleCost ? ModuleCost : 1; + SML << "=> Total Module Cost: " << ModuleCost << '\n' + << " => KernelCost: " << KernelCost << " (" + << format("%0.2f", (float(KernelCost) / ModuleCostOr1) * 100) << "%)\n" + << " => FnsCost: " << FnCost << " (" + << format("%0.2f", (float(FnCost) / ModuleCostOr1) * 100) << "%)\n"; return ModuleCost; } -/// \return true if \p F can be indirectly called static bool canBeIndirectlyCalled(const Function &F) { - if (F.isDeclaration() || AMDGPU::isEntryFunctionCC(F.getCallingConv())) + if (F.isDeclaration() || isEntryPoint(&F)) return false; return !F.hasLocalLinkage() || F.hasAddressTaken(/*PutOffender=*/nullptr, @@ -250,1081 +278,351 @@ static bool canBeIndirectlyCalled(const Function &F) { /*IgnoreCastedDirectCall=*/true); } -//===----------------------------------------------------------------------===// -// Graph-based Module Representation -//===----------------------------------------------------------------------===// - -/// AMDGPUSplitModule's view of the source Module, as a graph of all components -/// that can be split into different modules. -/// -/// The most trivial instance of this graph is just the CallGraph of the module, -/// but it is not guaranteed that the graph is strictly equal to the CG. It -/// currently always is but it's designed in a way that would eventually allow -/// us to create abstract nodes, or nodes for different entities such as global -/// variables or any other meaningful constraint we must consider. +/// When a function or any of its callees performs an indirect call, this +/// takes over \ref addAllDependencies and adds all potentially callable +/// functions to \p Fns so they can be counted as dependencies of the function. /// -/// The graph is only mutable by this class, and is generally not modified -/// after \ref SplitGraph::buildGraph runs. No consumers of the graph can -/// mutate it. -class SplitGraph { -public: - class Node; - - enum class EdgeKind : uint8_t { - /// The nodes are related through a direct call. This is a "strong" edge as - /// it means the Src will directly reference the Dst. - DirectCall, - /// The nodes are related through an indirect call. 
- /// This is a "weaker" edge and is only considered when traversing the graph - /// starting from a kernel. We need this edge for resource usage analysis. - /// - /// The reason why we have this edge in the first place is due to how - /// AMDGPUResourceUsageAnalysis works. In the presence of an indirect call, - /// the resource usage of the kernel containing the indirect call is the - /// max resource usage of all functions that can be indirectly called. - IndirectCall, - }; - - /// An edge between two nodes. Edges are directional, and tagged with a - /// "kind". - struct Edge { - Edge(Node *Src, Node *Dst, EdgeKind Kind) - : Src(Src), Dst(Dst), Kind(Kind) {} - - Node *Src; ///< Source - Node *Dst; ///< Destination - EdgeKind Kind; - }; - - using EdgesVec = SmallVector; - using edges_iterator = EdgesVec::const_iterator; - using nodes_iterator = const Node *const *; - - SplitGraph(const Module &M, const FunctionsCostMap &CostMap, - CostType ModuleCost) - : M(M), CostMap(CostMap), ModuleCost(ModuleCost) {} - - void buildGraph(CallGraph &CG); - -#ifndef NDEBUG - bool verifyGraph() const; -#endif - - bool empty() const { return Nodes.empty(); } - const iterator_range nodes() const { - return {Nodes.begin(), Nodes.end()}; +/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the +/// presence of an indirect call, the function's resource usage is the same as +/// the most expensive function in the module. +/// \param M The module. +/// \param Fns[out] Resulting list of functions. +static void addAllIndirectCallDependencies(const Module &M, + DenseSet &Fns) { + for (const auto &Fn : M) { + if (canBeIndirectlyCalled(Fn)) + Fns.insert(&Fn); } - const Node &getNode(unsigned ID) const { return *Nodes[ID]; } - - unsigned getNumNodes() const { return Nodes.size(); } - BitVector createNodesBitVector() const { return BitVector(Nodes.size()); } - - const Module &getModule() const { return M; } - - CostType getModuleCost() const { return ModuleCost; } - CostType getCost(const Function &F) const { return CostMap.at(&F); } - - /// \returns the aggregated cost of all nodes in \p BV (bits set to 1 = node - /// IDs). - CostType calculateCost(const BitVector &BV) const; - -private: - /// Retrieves the node for \p GV in \p Cache, or creates a new node for it and - /// updates \p Cache. - Node &getNode(DenseMap &Cache, - const GlobalValue &GV); - - // Create a new edge between two nodes and add it to both nodes. - const Edge &createEdge(Node &Src, Node &Dst, EdgeKind EK); - - const Module &M; - const FunctionsCostMap &CostMap; - CostType ModuleCost; - - // Final list of nodes with stable ordering. - SmallVector Nodes; - - SpecificBumpPtrAllocator NodesPool; - - // Edges are trivially destructible objects, so as a small optimization we - // use a BumpPtrAllocator which avoids destructor calls but also makes - // allocation faster. - static_assert( - std::is_trivially_destructible_v, - "Edge must be trivially destructible to use the BumpPtrAllocator"); - BumpPtrAllocator EdgesPool; -}; +} -/// Nodes in the SplitGraph contain both incoming, and outgoing edges. -/// Incoming edges have this node as their Dst, and Outgoing ones have this node -/// as their Src. +/// Adds the functions that \p Fn may call to \p Fns, then recurses into each +/// callee until all reachable functions have been gathered. /// -/// Edge objects are shared by both nodes in Src/Dst. 
They provide immediate -/// feedback on how two nodes are related, and in which direction they are -/// related, which is valuable information to make splitting decisions. -/// -/// Nodes are fundamentally abstract, and any consumers of the graph should -/// treat them as such. While a node will be a function most of the time, we -/// could also create nodes for any other reason. In the future, we could have -/// single nodes for multiple functions, or nodes for GVs, etc. -class SplitGraph::Node { - friend class SplitGraph; - -public: - Node(unsigned ID, const GlobalValue &GV, CostType IndividualCost, - bool IsNonCopyable) - : ID(ID), GV(GV), IndividualCost(IndividualCost), - IsNonCopyable(IsNonCopyable), IsEntryFnCC(false), IsGraphEntry(false) { - if (auto *Fn = dyn_cast(&GV)) - IsEntryFnCC = AMDGPU::isEntryFunctionCC(Fn->getCallingConv()); - } - - /// An 0-indexed ID for the node. The maximum ID (exclusive) is the number of - /// nodes in the graph. This ID can be used as an index in a BitVector. - unsigned getID() const { return ID; } - - const Function &getFunction() const { return cast(GV); } - - /// \returns the cost to import this component into a given module, not - /// accounting for any dependencies that may need to be imported as well. - CostType getIndividualCost() const { return IndividualCost; } - - bool isNonCopyable() const { return IsNonCopyable; } - bool isEntryFunctionCC() const { return IsEntryFnCC; } - - /// \returns whether this is an entry point in the graph. Entry points are - /// defined as follows: if you take all entry points in the graph, and iterate - /// their dependencies, you are guaranteed to visit all nodes in the graph at - /// least once. - bool isGraphEntryPoint() const { return IsGraphEntry; } - - StringRef getName() const { return GV.getName(); } - - bool hasAnyIncomingEdges() const { return IncomingEdges.size(); } - bool hasAnyIncomingEdgesOfKind(EdgeKind EK) const { - return any_of(IncomingEdges, [&](const auto *E) { return E->Kind == EK; }); - } - - bool hasAnyOutgoingEdges() const { return OutgoingEdges.size(); } - bool hasAnyOutgoingEdgesOfKind(EdgeKind EK) const { - return any_of(OutgoingEdges, [&](const auto *E) { return E->Kind == EK; }); - } - - iterator_range incoming_edges() const { - return IncomingEdges; - } - - iterator_range outgoing_edges() const { - return OutgoingEdges; - } - - bool shouldFollowIndirectCalls() const { return isEntryFunctionCC(); } - - /// Visit all children of this node in a recursive fashion. Also visits Self. - /// If \ref shouldFollowIndirectCalls returns false, then this only follows - /// DirectCall edges. - /// - /// \param Visitor Visitor Function. - void visitAllDependencies(std::function Visitor) const; - - /// Adds the depedencies of this node in \p BV by setting the bit - /// corresponding to each node. - /// - /// Implemented using \ref visitAllDependencies, hence it follows the same - /// rules regarding dependencies traversal. - /// - /// \param[out] BV The bitvector where the bits should be set. - void getDependencies(BitVector &BV) const { - visitAllDependencies([&](const Node &N) { BV.set(N.getID()); }); - } - - /// Uses \ref visitAllDependencies to aggregate the individual cost of this - /// node and all of its dependencies. - /// - /// This is cached. 
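[Editor's note] A standalone sketch of the cached aggregate-cost computation this comment describes, over a hypothetical plain adjacency list rather than the Node/Edge classes (the declaration being documented follows below):

    #include <cstdint>
    #include <vector>

    struct NodeInfo {
      int64_t IndividualCost = 0;
      std::vector<unsigned> Deps;   // outgoing edges, by node id
      mutable int64_t FullCost = 0; // 0 means "not computed yet"
    };

    int64_t fullCost(unsigned Id, const std::vector<NodeInfo> &Graph) {
      const NodeInfo &N = Graph[Id];
      if (N.FullCost)
        return N.FullCost;
      // Iterative walk over the dependency closure, counting each node once.
      std::vector<bool> Seen(Graph.size());
      std::vector<unsigned> WorkList{Id};
      int64_t Sum = 0;
      while (!WorkList.empty()) {
        unsigned Cur = WorkList.back();
        WorkList.pop_back();
        if (Seen[Cur])
          continue;
        Seen[Cur] = true;
        Sum += Graph[Cur].IndividualCost;
        for (unsigned Dep : Graph[Cur].Deps)
          WorkList.push_back(Dep);
      }
      return N.FullCost = Sum;
    }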
- CostType getFullCost() const; - -private: - void markAsGraphEntry() { IsGraphEntry = true; } - - unsigned ID; - const GlobalValue &GV; - CostType IndividualCost; - bool IsNonCopyable : 1; - bool IsEntryFnCC : 1; - bool IsGraphEntry : 1; - - // TODO: Cache dependencies as well? - mutable CostType FullCost = 0; - - // TODO: Use a single sorted vector (with all incoming/outgoing edges grouped - // together) - EdgesVec IncomingEdges; - EdgesVec OutgoingEdges; -}; - -void SplitGraph::Node::visitAllDependencies( - std::function Visitor) const { - const bool FollowIndirect = shouldFollowIndirectCalls(); - // FIXME: If this can access SplitGraph in the future, use a BitVector - // instead. - DenseSet Seen; - SmallVector WorkList({this}); +/// \param SML Log Helper +/// \param CG Call graph for \p Fn's module. +/// \param Fn Current function to look at. +/// \param Fns[out] Resulting list of functions. +/// \param OnlyDirect Whether to only consider direct callees. +/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some +/// point, either in \p Fn or in one of the function it calls. When that +/// happens, we fall back to adding all callable functions inside \p Fn's module +/// to \p Fns. +static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG, + const Function &Fn, + DenseSet &Fns, bool OnlyDirect, + bool &HadIndirectCall) { + assert(!Fn.isDeclaration()); + + const Module &M = *Fn.getParent(); + SmallVector WorkList({&Fn}); while (!WorkList.empty()) { - const Node *CurN = WorkList.pop_back_val(); - if (auto [It, Inserted] = Seen.insert(CurN); !Inserted) - continue; - - Visitor(*CurN); - - for (const Edge *E : CurN->outgoing_edges()) { - if (!FollowIndirect && E->Kind == EdgeKind::IndirectCall) - continue; - WorkList.push_back(E->Dst); - } - } -} - -CostType SplitGraph::Node::getFullCost() const { - if (FullCost) - return FullCost; - - assert(FullCost == 0); - visitAllDependencies( - [&](const Node &N) { FullCost += N.getIndividualCost(); }); - return FullCost; -} + const auto &CurFn = *WorkList.pop_back_val(); + assert(!CurFn.isDeclaration()); -void SplitGraph::buildGraph(CallGraph &CG) { - SplitModuleTimer SMT("buildGraph", "graph construction"); - LLVM_DEBUG( - dbgs() - << "[build graph] constructing graph representation of the input\n"); - - // We build the graph by just iterating all functions in the module and - // working on their direct callees. At the end, all nodes should be linked - // together as expected. - DenseMap Cache; - SmallVector FnsWithIndirectCalls, IndirectlyCallableFns; - for (const Function &Fn : M) { - if (Fn.isDeclaration()) - continue; + // Scan for an indirect call. If such a call is found, we have to + // conservatively assume this can call all non-entrypoint functions in the + // module. - // Look at direct callees and create the necessary edges in the graph. - bool HasIndirectCall = false; - Node &N = getNode(Cache, Fn); - for (auto &CGEntry : *CG[&Fn]) { + for (auto &CGEntry : *CG[&CurFn]) { auto *CGNode = CGEntry.second; auto *Callee = CGNode->getFunction(); if (!Callee) { - // TODO: Don't consider inline assembly as indirect calls. - if (CGNode == CG.getCallsExternalNode()) - HasIndirectCall = true; + if (OnlyDirect) + continue; + + // Functions have an edge towards CallsExternalNode if they're external + // declarations, or if they do an indirect call. As we only process + // definitions here, we know this means the function has an indirect + // call. 
We then have to conservatively assume this can call all + // non-entrypoint functions in the module. + if (CGNode != CG.getCallsExternalNode()) + continue; // this is another function-less node we don't care about. + + SML << "Indirect call detected in " << getName(CurFn) + << " - treating all non-entrypoint functions as " + "potential dependencies\n"; + + // TODO: Print an ORE as well ? + addAllIndirectCallDependencies(M, Fns); + HadIndirectCall = true; continue; } - if (!Callee->isDeclaration()) - createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall); - } - - // Keep track of this function if it contains an indirect call and/or if it - // can be indirectly called. - if (HasIndirectCall) { - LLVM_DEBUG(dbgs() << "indirect call found in " << Fn.getName() << "\n"); - FnsWithIndirectCalls.push_back(&Fn); - } - - if (canBeIndirectlyCalled(Fn)) - IndirectlyCallableFns.push_back(&Fn); - } + if (Callee->isDeclaration()) + continue; - // Post-process functions with indirect calls. - for (const Function *Fn : FnsWithIndirectCalls) { - for (const Function *Candidate : IndirectlyCallableFns) { - Node &Src = getNode(Cache, *Fn); - Node &Dst = getNode(Cache, *Candidate); - createEdge(Src, Dst, EdgeKind::IndirectCall); + auto [It, Inserted] = Fns.insert(Callee); + if (Inserted) + WorkList.push_back(Callee); } } - - // Now, find all entry points. - SmallVector CandidateEntryPoints; - BitVector NodesReachableByKernels = createNodesBitVector(); - for (Node *N : Nodes) { - // Functions with an Entry CC are always graph entry points too. - if (N->isEntryFunctionCC()) { - N->markAsGraphEntry(); - N->getDependencies(NodesReachableByKernels); - } else if (!N->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall)) - CandidateEntryPoints.push_back(N); - } - - for (Node *N : CandidateEntryPoints) { - // This can be another entry point if it's not reachable by a kernel - // TODO: We could sort all of the possible new entries in a stable order - // (e.g. by cost), then consume them one by one until - // NodesReachableByKernels is all 1s. It'd allow us to avoid - // considering some nodes as non-entries in some specific cases. - if (!NodesReachableByKernels.test(N->getID())) - N->markAsGraphEntry(); - } - -#ifndef NDEBUG - assert(verifyGraph()); -#endif } -#ifndef NDEBUG -bool SplitGraph::verifyGraph() const { - unsigned ExpectedID = 0; - // Exceptionally using a set here in case IDs are messed up. 
- DenseSet SeenNodes; - DenseSet SeenFunctionNodes; - for (const Node *N : Nodes) { - if (N->getID() != (ExpectedID++)) { - errs() << "Node IDs are incorrect!\n"; - return false; - } - - if (!SeenNodes.insert(N).second) { - errs() << "Node seen more than once!\n"; - return false; - } - - if (&getNode(N->getID()) != N) { - errs() << "getNode doesn't return the right node\n"; - return false; - } - - for (const Edge *E : N->IncomingEdges) { - if (!E->Src || !E->Dst || (E->Dst != N) || - (find(E->Src->OutgoingEdges, E) == E->Src->OutgoingEdges.end())) { - errs() << "ill-formed incoming edges\n"; - return false; - } - } - - for (const Edge *E : N->OutgoingEdges) { - if (!E->Src || !E->Dst || (E->Src != N) || - (find(E->Dst->IncomingEdges, E) == E->Dst->IncomingEdges.end())) { - errs() << "ill-formed outgoing edges\n"; - return false; - } - } - - const Function &Fn = N->getFunction(); - if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) { - if (N->hasAnyIncomingEdges()) { - errs() << "Kernels cannot have incoming edges\n"; - return false; - } - } - - if (Fn.isDeclaration()) { - errs() << "declarations shouldn't have nodes!\n"; - return false; - } - - auto [It, Inserted] = SeenFunctionNodes.insert(&Fn); - if (!Inserted) { - errs() << "one function has multiple nodes!\n"; - return false; +/// Contains information about a function and its dependencies. +/// This is a splitting root. The splitting algorithm works by +/// assigning these to partitions. +struct FunctionWithDependencies { + FunctionWithDependencies(SplitModuleLogger &SML, CallGraph &CG, + const DenseMap &FnCosts, + const Function *Fn) + : Fn(Fn) { + // When Fn is not a kernel, we don't need to collect indirect callees. + // Resource usage analysis is only performed on kernels, and we collect + // indirect callees for resource usage analysis. + addAllDependencies(SML, CG, *Fn, Dependencies, + /*OnlyDirect*/ !isEntryPoint(Fn), HasIndirectCall); + TotalCost = FnCosts.at(Fn); + for (const auto *Dep : Dependencies) { + TotalCost += FnCosts.at(Dep); + + // We cannot duplicate functions with external linkage, or functions that + // may be overriden at runtime. + HasNonDuplicatableDependecy |= + (Dep->hasExternalLinkage() || !Dep->isDefinitionExact()); } } - if (ExpectedID != Nodes.size()) { - errs() << "Node IDs out of sync!\n"; - return false; - } - - if (createNodesBitVector().size() != getNumNodes()) { - errs() << "nodes bit vector doesn't have the right size!\n"; - return false; - } - - // Check we respect the promise of Node::isKernel - BitVector BV = createNodesBitVector(); - for (const Node *N : nodes()) { - if (N->isGraphEntryPoint()) - N->getDependencies(BV); - } - - // Ensure each function in the module has an associated node. 
- for (const auto &Fn : M) { - if (!Fn.isDeclaration()) { - if (!SeenFunctionNodes.contains(&Fn)) { - errs() << "Fn has no associated node in the graph!\n"; - return false; - } - } - } - - if (!BV.all()) { - errs() << "not all nodes are reachable through the graph's entry points!\n"; - return false; - } - - return true; -} -#endif - -CostType SplitGraph::calculateCost(const BitVector &BV) const { - CostType Cost = 0; - for (unsigned NodeID : BV.set_bits()) - Cost += getNode(NodeID).getIndividualCost(); - return Cost; -} - -SplitGraph::Node & -SplitGraph::getNode(DenseMap &Cache, - const GlobalValue &GV) { - auto &N = Cache[&GV]; - if (N) - return *N; - - CostType Cost = 0; - bool NonCopyable = false; - if (const Function *Fn = dyn_cast(&GV)) { - NonCopyable = isNonCopyable(*Fn); - Cost = CostMap.at(Fn); - } - N = new (NodesPool.Allocate()) Node(Nodes.size(), GV, Cost, NonCopyable); - Nodes.push_back(N); - assert(&getNode(N->getID()) == N); - return *N; -} - -const SplitGraph::Edge &SplitGraph::createEdge(Node &Src, Node &Dst, - EdgeKind EK) { - const Edge *E = new (EdgesPool.Allocate(1)) Edge(&Src, &Dst, EK); - Src.OutgoingEdges.push_back(E); - Dst.IncomingEdges.push_back(E); - return *E; -} - -//===----------------------------------------------------------------------===// -// Split Proposals -//===----------------------------------------------------------------------===// - -/// Represents a module splitting proposal. -/// -/// Proposals are made of N BitVectors, one for each partition, where each bit -/// set indicates that the node is present and should be copied inside that -/// partition. -/// -/// Proposals have several metrics attached so they can be compared/sorted, -/// which the driver to try multiple strategies resultings in multiple proposals -/// and choose the best one out of them. -class SplitProposal { -public: - SplitProposal(const SplitGraph &SG, unsigned MaxPartitions) : SG(&SG) { - Partitions.resize(MaxPartitions, {0, SG.createNodesBitVector()}); - } + const Function *Fn = nullptr; + DenseSet Dependencies; + /// Whether \p Fn or any of its \ref Dependencies contains an indirect call. + bool HasIndirectCall = false; + /// Whether any of \p Fn's dependencies cannot be duplicated. + bool HasNonDuplicatableDependecy = false; - void setName(StringRef NewName) { Name = NewName; } - StringRef getName() const { return Name; } - - const BitVector &operator[](unsigned PID) const { - return Partitions[PID].second; - } - - void add(unsigned PID, const BitVector &BV) { - Partitions[PID].second |= BV; - updateScore(PID); - } - - void print(raw_ostream &OS) const; - LLVM_DUMP_METHOD void dump() const { print(dbgs()); } - - // Find the cheapest partition (lowest cost). In case of ties, always returns - // the highest partition number. - unsigned findCheapestPartition() const; - - /// Calculate the CodeSize and Bottleneck scores. - void calculateScores(); - -#ifndef NDEBUG - void verifyCompleteness() const; -#endif - - /// Only available after \ref calculateScores is called. - /// - /// A positive number indicating the % of code duplication that this proposal - /// creates. e.g. 0.2 means this proposal adds roughly 20% code size by - /// duplicating some functions across partitions. - /// - /// Value is always rounded up to 3 decimal places. - /// - /// A perfect score would be 0.0, and anything approaching 1.0 is very bad. - double getCodeSizeScore() const { return CodeSizeScore; } - - /// Only available after \ref calculateScores is called. 
- /// - /// A number between [0, 1] which indicates how big of a bottleneck is - /// expected from the largest partition. - /// - /// A score of 1.0 means the biggest partition is as big as the source module, - /// so build time will be equal to or greater than the build time of the - /// initial input. - /// - /// Value is always rounded up to 3 decimal places. - /// - /// This is one of the metrics used to estimate this proposal's build time. - double getBottleneckScore() const { return BottleneckScore; } - -private: - void updateScore(unsigned PID) { - assert(SG); - for (auto &[PCost, Nodes] : Partitions) { - TotalCost -= PCost; - PCost = SG->calculateCost(Nodes); - TotalCost += PCost; - } - } - - /// \see getCodeSizeScore - double CodeSizeScore = 0.0; - /// \see getBottleneckScore - double BottleneckScore = 0.0; - /// Aggregated cost of all partitions CostType TotalCost = 0; - const SplitGraph *SG = nullptr; - std::string Name; - - std::vector> Partitions; -}; - -void SplitProposal::print(raw_ostream &OS) const { - assert(SG); - - OS << "[proposal] " << Name << ", total cost:" << TotalCost - << ", code size score:" << format("%0.3f", CodeSizeScore) - << ", bottleneck score:" << format("%0.3f", BottleneckScore) << '\n'; - for (const auto &[PID, Part] : enumerate(Partitions)) { - const auto &[Cost, NodeIDs] = Part; - OS << " - P" << PID << " nodes:" << NodeIDs.count() << " cost: " << Cost - << '|' << formatRatioOf(Cost, SG->getModuleCost()) << "%\n"; - } -} - -unsigned SplitProposal::findCheapestPartition() const { - assert(!Partitions.empty()); - CostType CurCost = std::numeric_limits::max(); - unsigned CurPID = InvalidPID; - for (const auto &[Idx, Part] : enumerate(Partitions)) { - if (Part.first <= CurCost) { - CurPID = Idx; - CurCost = Part.first; - } - } - assert(CurPID != InvalidPID); - return CurPID; -} - -void SplitProposal::calculateScores() { - if (Partitions.empty()) - return; - - assert(SG); - CostType LargestPCost = 0; - for (auto &[PCost, Nodes] : Partitions) { - if (PCost > LargestPCost) - LargestPCost = PCost; + /// \returns true if this function and its dependencies can be considered + /// large according to \p Threshold. + bool isLarge(CostType Threshold) const { + return TotalCost > Threshold && !Dependencies.empty(); } - - CostType ModuleCost = SG->getModuleCost(); - CodeSizeScore = double(TotalCost) / ModuleCost; - assert(CodeSizeScore >= 0.0); - - BottleneckScore = double(LargestPCost) / ModuleCost; - - CodeSizeScore = std::ceil(CodeSizeScore * 100.0) / 100.0; - BottleneckScore = std::ceil(BottleneckScore * 100.0) / 100.0; -} - -#ifndef NDEBUG -void SplitProposal::verifyCompleteness() const { - if (Partitions.empty()) - return; - - BitVector Result = Partitions[0].second; - for (const auto &P : drop_begin(Partitions)) - Result |= P.second; - assert(Result.all() && "some nodes are missing from this proposal!"); -} -#endif - -//===-- RecursiveSearchStrategy -------------------------------------------===// - -/// Partitioning algorithm. -/// -/// This is a recursive search algorithm that can explore multiple possiblities. -/// -/// When a cluster of nodes can go into more than one partition, and we haven't -/// reached maximum search depth, we recurse and explore both options and their -/// consequences. Both branches will yield a proposal, and the driver will grade -/// both and choose the best one. -/// -/// If max depth is reached, we will use some heuristics to make a choice. 
Most -/// of the time we will just use the least-pressured (cheapest) partition, but -/// if a cluster is particularly big and there is a good amount of overlap with -/// an existing partition, we will choose that partition instead. -class RecursiveSearchSplitting { -public: - using SubmitProposalFn = function_ref; - - RecursiveSearchSplitting(const SplitGraph &SG, unsigned NumParts, - SubmitProposalFn SubmitProposal); - - void run(); - -private: - struct WorkListEntry { - WorkListEntry(const BitVector &BV) : Cluster(BV) {} - - unsigned NumNonEntryNodes = 0; - CostType TotalCost = 0; - CostType CostExcludingGraphEntryPoints = 0; - BitVector Cluster; - }; - - /// Collects all graph entry points's clusters and sort them so the most - /// expensive clusters are viewed first. This will merge clusters together if - /// they share a non-copyable dependency. - void setupWorkList(); - - /// Recursive function that assigns the worklist item at \p Idx into a - /// partition of \p SP. - /// - /// \p Depth is the current search depth. When this value is equal to - /// \ref MaxDepth, we can no longer recurse. - /// - /// This function only recurses if there is more than one possible assignment, - /// otherwise it is iterative to avoid creating a call stack that is as big as - /// \ref WorkList. - void pickPartition(unsigned Depth, unsigned Idx, SplitProposal SP); - - /// \return A pair: first element is the PID of the partition that has the - /// most similarities with \p Entry, or \ref InvalidPID if no partition was - /// found with at least one element in common. The second element is the - /// aggregated cost of all dependencies in common between \p Entry and that - /// partition. - std::pair - findMostSimilarPartition(const WorkListEntry &Entry, const SplitProposal &SP); - - const SplitGraph &SG; - unsigned NumParts; - SubmitProposalFn SubmitProposal; - - // A Cluster is considered large when its cost, excluding entry points, - // exceeds this value. - CostType LargeClusterThreshold = 0; - unsigned NumProposalsSubmitted = 0; - SmallVector WorkList; }; -RecursiveSearchSplitting::RecursiveSearchSplitting( - const SplitGraph &SG, unsigned NumParts, SubmitProposalFn SubmitProposal) - : SG(SG), NumParts(NumParts), SubmitProposal(SubmitProposal) { - // arbitrary max value as a safeguard. Anything above 10 will already be - // slow, this is just a max value to prevent extreme resource exhaustion or - // unbounded run time. - if (MaxDepth > 16) - report_fatal_error("[amdgpu-split-module] search depth of " + - Twine(MaxDepth) + " is too high!"); - LargeClusterThreshold = - (LargeFnFactor != 0.0) - ? CostType(((SG.getModuleCost() / NumParts) * LargeFnFactor)) - : std::numeric_limits::max(); - LLVM_DEBUG(dbgs() << "[recursive search] large cluster threshold set at " - << LargeClusterThreshold << "\n"); -} - -void RecursiveSearchSplitting::run() { - { - SplitModuleTimer SMT("recursive_search_prepare", "preparing worklist"); - setupWorkList(); +/// Calculates how much overlap there is between \p A and \p B. +/// \return A number between 0.0 and 1.0, where 1.0 means A == B and 0.0 means A +/// and B have no shared elements. Kernels do not count in overlap calculation. 
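[Editor's note] In other words, this is a Jaccard-style index: shared elements divided by the size of the union. For example, if A's dependencies are {f, g, h} and B contains {g, h, k}, the union has four elements and two are shared, so the overlap is 0.5. A minimal standalone version over plain string sets (hypothetical, without the kernel filtering; the real implementation follows below):

    #include <set>
    #include <string>

    float overlap(const std::set<std::string> &A,
                  const std::set<std::string> &B) {
      std::set<std::string> Union = A;
      unsigned NumCommon = 0;
      for (const std::string &Name : B)
        if (!Union.insert(Name).second)
          ++NumCommon; // already present, i.e. shared between A and B
      return Union.empty() ? 0.0f : float(NumCommon) / Union.size();
    }

    // overlap({"f","g","h"}, {"g","h","k"}) == 2.0f / 4.0f == 0.5f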
+static float calculateOverlap(const DenseSet &A, + const DenseSet &B) { + DenseSet Total; + for (const auto *F : A) { + if (!isEntryPoint(F)) + Total.insert(F); } - { - SplitModuleTimer SMT("recursive_search_pick", "partitioning"); - SplitProposal SP(SG, NumParts); - pickPartition(/*BranchDepth=*/0, /*Idx=*/0, SP); - } -} + if (Total.empty()) + return 0.0f; -void RecursiveSearchSplitting::setupWorkList() { - // e.g. if A and B are two worklist item, and they both call a non copyable - // dependency C, this does: - // A=C - // B=C - // => NodeEC will create a single group (A, B, C) and we create a new - // WorkList entry for that group. - - EquivalenceClasses NodeEC; - for (const SplitGraph::Node *N : SG.nodes()) { - if (!N->isGraphEntryPoint()) + unsigned NumCommon = 0; + for (const auto *F : B) { + if (isEntryPoint(F)) continue; - NodeEC.insert(N->getID()); - N->visitAllDependencies([&](const SplitGraph::Node &Dep) { - if (&Dep != N && Dep.isNonCopyable()) - NodeEC.unionSets(N->getID(), Dep.getID()); - }); + auto [It, Inserted] = Total.insert(F); + if (!Inserted) + ++NumCommon; } - for (auto I = NodeEC.begin(), E = NodeEC.end(); I != E; ++I) { - if (!I->isLeader()) - continue; + return static_cast(NumCommon) / Total.size(); +} - BitVector Cluster = SG.createNodesBitVector(); - for (auto MI = NodeEC.member_begin(I); MI != NodeEC.member_end(); ++MI) { - const SplitGraph::Node &N = SG.getNode(*MI); - if (N.isGraphEntryPoint()) - N.getDependencies(Cluster); - } - WorkList.emplace_back(std::move(Cluster)); - } +/// Performs all of the partitioning work on \p M. +/// \param SML Log Helper +/// \param M Module to partition. +/// \param NumParts Number of partitions to create. +/// \param ModuleCost Total cost of all functions in \p M. +/// \param FnCosts Map of Function -> Cost +/// \param WorkList Functions and their dependencies to process in order. +/// \returns The created partitions (a vector of size \p NumParts ) +static std::vector> +doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts, + CostType ModuleCost, + const DenseMap &FnCosts, + const SmallVector &WorkList) { + + SML << "\n--Partitioning Starts--\n"; + + // Calculate a "large function threshold". When more than one function's total + // import cost exceeds this value, we will try to assign it to an existing + // partition to reduce the amount of duplication needed. + // + // e.g. let two functions X and Y have a import cost of ~10% of the module, we + // assign X to a partition as usual, but when we get to Y, we check if it's + // worth also putting it in Y's partition. + const CostType LargeFnThreshold = + LargeFnFactor ? CostType(((ModuleCost / NumParts) * LargeFnFactor)) + : std::numeric_limits::max(); + + std::vector> Partitions; + Partitions.resize(NumParts); + + // Assign functions to partitions, and try to keep the partitions more or + // less balanced. We do that through a priority queue sorted in reverse, so we + // can always look at the partition with the least content. + // + // There are some cases where we will be deliberately unbalanced though. + // - Large functions: we try to merge with existing partitions to reduce code + // duplication. + // - Functions with indirect or external calls always go in the first + // partition (P0). + auto ComparePartitions = [](const std::pair &a, + const std::pair &b) { + // When two partitions have the same cost, assign to the one with the + // biggest ID first. This allows us to put things in P0 last, because P0 may + // have other stuff added later. 
+ if (a.second == b.second) + return a.first < b.first; + return a.second > b.second; + }; - // Calculate costs and other useful information. - for (WorkListEntry &Entry : WorkList) { - for (unsigned NodeID : Entry.Cluster.set_bits()) { - const SplitGraph::Node &N = SG.getNode(NodeID); - const CostType Cost = N.getIndividualCost(); + // We can't use priority_queue here because we need to be able to access any + // element. This makes this a bit inefficient as we need to sort it again + // everytime we change it, but it's a very small array anyway (likely under 64 + // partitions) so it's a cheap operation. + std::vector> BalancingQueue; + for (unsigned I = 0; I < NumParts; ++I) + BalancingQueue.emplace_back(I, 0); + + // Helper function to handle assigning a function to a partition. This takes + // care of updating the balancing queue. + const auto AssignToPartition = [&](PartitionID PID, + const FunctionWithDependencies &FWD) { + auto &FnsInPart = Partitions[PID]; + FnsInPart.insert(FWD.Fn); + FnsInPart.insert(FWD.Dependencies.begin(), FWD.Dependencies.end()); + + SML << "assign " << getName(*FWD.Fn) << " to P" << PID << "\n -> "; + if (!FWD.Dependencies.empty()) { + SML << FWD.Dependencies.size() << " dependencies added\n"; + }; + + // Update the balancing queue. we scan backwards because in the common case + // the partition is at the end. + for (auto &[QueuePID, Cost] : reverse(BalancingQueue)) { + if (QueuePID == PID) { + CostType NewCost = 0; + for (auto *Fn : Partitions[PID]) + NewCost += FnCosts.at(Fn); + + SML << "[Updating P" << PID << " Cost]:" << Cost << " -> " << NewCost; + if (Cost) { + SML << " (" << unsigned(((float(NewCost) / Cost) - 1) * 100) + << "% increase)"; + } + SML << '\n'; - Entry.TotalCost += Cost; - if (!N.isGraphEntryPoint()) { - Entry.CostExcludingGraphEntryPoints += Cost; - ++Entry.NumNonEntryNodes; + Cost = NewCost; } } - } - sort(WorkList, [](const WorkListEntry &LHS, const WorkListEntry &RHS) { - return LHS.TotalCost > RHS.TotalCost; - }); - - LLVM_DEBUG({ - dbgs() << "[recursive search] worklist:\n"; - for (const auto &[Idx, Entry] : enumerate(WorkList)) { - dbgs() << " - [" << Idx << "]: "; - for (unsigned NodeID : Entry.Cluster.set_bits()) - dbgs() << NodeID << " "; - dbgs() << "(total_cost:" << Entry.TotalCost - << ", cost_excl_entries:" << Entry.CostExcludingGraphEntryPoints - << ")\n"; - } - }); -} + sort(BalancingQueue, ComparePartitions); + }; -void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx, - SplitProposal SP) { - while (Idx < WorkList.size()) { - // Step 1: Determine candidate PIDs. - // - const WorkListEntry &Entry = WorkList[Idx]; - const BitVector &Cluster = Entry.Cluster; - - // Default option is to do load-balancing, AKA assign to least pressured - // partition. - const unsigned CheapestPID = SP.findCheapestPartition(); - assert(CheapestPID != InvalidPID); - - // Explore assigning to the kernel that contains the most dependencies in - // common. - const auto [MostSimilarPID, SimilarDepsCost] = - findMostSimilarPartition(Entry, SP); - - // We can chose to explore only one path if we only have one valid path, or - // if we reached maximum search depth and can no longer branch out. - unsigned SinglePIDToTry = InvalidPID; - if (MostSimilarPID == InvalidPID) // no similar PID found - SinglePIDToTry = CheapestPID; - else if (MostSimilarPID == CheapestPID) // both landed on the same PID - SinglePIDToTry = CheapestPID; - else if (Depth >= MaxDepth) { - // We have to choose one path. 
Use a heuristic to guess which one will be - // more appropriate. - if (Entry.CostExcludingGraphEntryPoints > LargeClusterThreshold) { - // Check if the amount of code in common makes it worth it. - assert(SimilarDepsCost && Entry.CostExcludingGraphEntryPoints); - const double Ratio = - SimilarDepsCost / Entry.CostExcludingGraphEntryPoints; - assert(Ratio >= 0.0 && Ratio <= 1.0); - if (LargeFnOverlapForMerge > Ratio) { - // For debug, just print "L", so we'll see "L3=P3" for instance, which - // will mean we reached max depth and chose P3 based on this - // heuristic. - LLVM_DEBUG(dbgs() << 'L'); - SinglePIDToTry = MostSimilarPID; - } - } else - SinglePIDToTry = CheapestPID; + for (auto &CurFn : WorkList) { + // When a function has indirect calls, it must stay in the first partition + // alongside every reachable non-entry function. This is a nightmare case + // for splitting as it severely limits what we can do. + if (CurFn.HasIndirectCall) { + SML << "Function with indirect call(s): " << getName(*CurFn.Fn) + << " defaulting to P0\n"; + AssignToPartition(0, CurFn); + continue; } - // Step 2: Explore candidates. - - // When we only explore one possible path, and thus branch depth doesn't - // increase, do not recurse, iterate instead. - if (SinglePIDToTry != InvalidPID) { - LLVM_DEBUG(dbgs() << Idx << "=P" << SinglePIDToTry << ' '); - // Only one path to explore, don't clone SP, don't increase depth. - SP.add(SinglePIDToTry, Cluster); - ++Idx; + // When a function has non duplicatable dependencies, we have to keep it in + // the first partition as well. This is a conservative approach, a + // finer-grained approach could keep track of which dependencies are + // non-duplicatable exactly and just make sure they're grouped together. + if (CurFn.HasNonDuplicatableDependecy) { + SML << "Function with externally visible dependency " + << getName(*CurFn.Fn) << " defaulting to P0\n"; + AssignToPartition(0, CurFn); continue; } - assert(MostSimilarPID != InvalidPID); - - // We explore multiple paths: recurse at increased depth, then stop this - // function. - - LLVM_DEBUG(dbgs() << '\n'); - - // lb = load balancing = put in cheapest partition - { - SplitProposal BranchSP = SP; - LLVM_DEBUG(dbgs().indent(Depth) - << " [lb] " << Idx << "=P" << CheapestPID << "? "); - BranchSP.add(CheapestPID, Cluster); - pickPartition(Depth + 1, Idx + 1, BranchSP); - } + // Be smart with large functions to avoid duplicating their dependencies. + if (CurFn.isLarge(LargeFnThreshold)) { + assert(LargeFnOverlapForMerge >= 0.0f && LargeFnOverlapForMerge <= 1.0f); + SML << "Large Function: " << getName(*CurFn.Fn) + << " - looking for partition with at least " + << format("%0.2f", LargeFnOverlapForMerge * 100) << "% overlap\n"; + + bool Assigned = false; + for (const auto &[PID, Fns] : enumerate(Partitions)) { + float Overlap = calculateOverlap(CurFn.Dependencies, Fns); + SML << " => " << format("%0.2f", Overlap * 100) << "% overlap with P" + << PID << '\n'; + if (Overlap > LargeFnOverlapForMerge) { + SML << " selecting P" << PID << '\n'; + AssignToPartition(PID, CurFn); + Assigned = true; + } + } - // ms = most similar = put in partition with the most in common - { - SplitProposal BranchSP = SP; - LLVM_DEBUG(dbgs().indent(Depth) - << " [ms] " << Idx << "=P" << MostSimilarPID << "? "); - BranchSP.add(MostSimilarPID, Cluster); - pickPartition(Depth + 1, Idx + 1, BranchSP); + if (Assigned) + continue; } - return; + // Normal "load-balancing", assign to partition with least pressure. 
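[Editor's note] The statement this comment annotates follows below; it relies on ComparePartitions keeping the vector sorted by descending cost, so back() is always the least-loaded partition. A self-contained check of that invariant (illustrative only, with a hypothetical three-partition queue):

    #include <algorithm>
    #include <cassert>
    #include <utility>
    #include <vector>

    int main() {
      // (partition id, accumulated cost)
      std::vector<std::pair<unsigned, long>> Queue = {{0, 10}, {1, 40}, {2, 25}};
      std::sort(Queue.begin(), Queue.end(),
                [](const auto &A, const auto &B) {
                  if (A.second == B.second)
                    return A.first < B.first;
                  return A.second > B.second; // most expensive first
                });
      // The cheapest partition (id 0, cost 10) ends up at the back.
      assert(Queue.back().first == 0 && Queue.back().second == 10);
      return 0;
    }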
+ auto [PID, CurCost] = BalancingQueue.back(); + AssignToPartition(PID, CurFn); } - // Step 3: If we assigned all WorkList items, submit the proposal. - - assert(Idx == WorkList.size()); - assert(NumProposalsSubmitted <= (2u << MaxDepth) && - "Search got out of bounds?"); - SP.setName("recursive_search (depth=" + std::to_string(Depth) + ") #" + - std::to_string(NumProposalsSubmitted++)); - LLVM_DEBUG(dbgs() << '\n'); - SubmitProposal(SP); -} - -std::pair -RecursiveSearchSplitting::findMostSimilarPartition(const WorkListEntry &Entry, - const SplitProposal &SP) { - if (!Entry.NumNonEntryNodes) - return {InvalidPID, 0}; - - // We take the partition that is the most similar using Cost as a metric. - // So we take the set of nodes in common, compute their aggregated cost, and - // pick the partition with the highest cost in common. - unsigned ChosenPID = InvalidPID; - CostType ChosenCost = 0; - for (unsigned PID = 0; PID < NumParts; ++PID) { - BitVector BV = SP[PID]; - BV &= Entry.Cluster; // FIXME: & doesn't work between BVs?! - - if (BV.none()) - continue; - - const CostType Cost = SG.calculateCost(BV); - - if (ChosenPID == InvalidPID || ChosenCost < Cost || - (ChosenCost == Cost && PID > ChosenPID)) { - ChosenPID = PID; - ChosenCost = Cost; + if (SML) { + CostType ModuleCostOr1 = ModuleCost ? ModuleCost : 1; + for (const auto &[Idx, Part] : enumerate(Partitions)) { + CostType Cost = 0; + for (auto *Fn : Part) + Cost += FnCosts.at(Fn); + SML << "P" << Idx << " has a total cost of " << Cost << " (" + << format("%0.2f", (float(Cost) / ModuleCostOr1) * 100) + << "% of source module)\n"; } - } - - return {ChosenPID, ChosenCost}; -} -//===----------------------------------------------------------------------===// -// DOTGraph Printing Support -//===----------------------------------------------------------------------===// - -const SplitGraph::Node *mapEdgeToDst(const SplitGraph::Edge *E) { - return E->Dst; -} - -using SplitGraphEdgeDstIterator = - mapped_iterator; - -} // namespace - -template <> struct GraphTraits { - using NodeRef = const SplitGraph::Node *; - using nodes_iterator = SplitGraph::nodes_iterator; - using ChildIteratorType = SplitGraphEdgeDstIterator; - - using EdgeRef = const SplitGraph::Edge *; - using ChildEdgeIteratorType = SplitGraph::edges_iterator; - - static NodeRef getEntryNode(NodeRef N) { return N; } - - static ChildIteratorType child_begin(NodeRef Ref) { - return {Ref->outgoing_edges().begin(), mapEdgeToDst}; - } - static ChildIteratorType child_end(NodeRef Ref) { - return {Ref->outgoing_edges().end(), mapEdgeToDst}; - } - - static nodes_iterator nodes_begin(const SplitGraph &G) { - return G.nodes().begin(); - } - static nodes_iterator nodes_end(const SplitGraph &G) { - return G.nodes().end(); - } -}; - -template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { - DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} - - static std::string getGraphName(const SplitGraph &SG) { - return SG.getModule().getName().str(); - } - - std::string getNodeLabel(const SplitGraph::Node *N, const SplitGraph &SG) { - return N->getName().str(); - } - - static std::string getNodeDescription(const SplitGraph::Node *N, - const SplitGraph &SG) { - std::string Result; - if (N->isEntryFunctionCC()) - Result += "entry-fn-cc "; - if (N->isNonCopyable()) - Result += "non-copyable "; - Result += "cost:" + std::to_string(N->getIndividualCost()); - return Result; - } - - static std::string getNodeAttributes(const SplitGraph::Node *N, - const SplitGraph &SG) { - return 
N->hasAnyIncomingEdges() ? "" : "color=\"red\""; + SML << "--Partitioning Done--\n\n"; } - static std::string getEdgeAttributes(const SplitGraph::Node *N, - SplitGraphEdgeDstIterator EI, - const SplitGraph &SG) { + // Check no functions were missed. +#ifndef NDEBUG + DenseSet AllFunctions; + for (const auto &Part : Partitions) + AllFunctions.insert(Part.begin(), Part.end()); - switch ((*EI.getCurrent())->Kind) { - case SplitGraph::EdgeKind::DirectCall: - return ""; - case SplitGraph::EdgeKind::IndirectCall: - return "style=\"dashed\""; + for (auto &Fn : M) { + if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) { + assert(AllFunctions.contains(&Fn) && "Missed a function?!"); } - llvm_unreachable("Unknown SplitGraph::EdgeKind enum"); } -}; - -//===----------------------------------------------------------------------===// -// Driver -//===----------------------------------------------------------------------===// - -namespace { +#endif -// If we didn't externalize GVs, then local GVs need to be conservatively -// imported into every module (including their initializers), and then cleaned -// up afterwards. -static bool needsConservativeImport(const GlobalValue *GV) { - if (const auto *Var = dyn_cast(GV)) - return Var->hasLocalLinkage(); - return isa(GV); + return Partitions; } -/// Prints a summary of the partition \p N, represented by module \p M, to \p -/// OS. -static void printPartitionSummary(raw_ostream &OS, unsigned N, const Module &M, - unsigned PartCost, unsigned ModuleCost) { - OS << "*** Partition P" << N << " ***\n"; - - for (const auto &Fn : M) { - if (!Fn.isDeclaration()) - OS << " - [function] " << Fn.getName() << "\n"; - } - - for (const auto &GV : M.globals()) { - if (GV.hasInitializer()) - OS << " - [global] " << GV.getName() << "\n"; +static void externalize(GlobalValue &GV) { + if (GV.hasLocalLinkage()) { + GV.setLinkage(GlobalValue::ExternalLinkage); + GV.setVisibility(GlobalValue::HiddenVisibility); } - OS << "Partition contains " << formatRatioOf(PartCost, ModuleCost) - << "% of the source\n"; -} - -static void evaluateProposal(SplitProposal &Best, SplitProposal New) { - SplitModuleTimer SMT("proposal_evaluation", "proposal ranking algorithm"); - - New.calculateScores(); - - LLVM_DEBUG({ - New.verifyCompleteness(); - if (DebugProposalSearch) - New.print(dbgs()); - }); - - const double CurBScore = Best.getBottleneckScore(); - const double CurCSScore = Best.getCodeSizeScore(); - const double NewBScore = New.getBottleneckScore(); - const double NewCSScore = New.getCodeSizeScore(); - - // TODO: Improve this - // We can probably lower the precision of the comparison at first - // e.g. if we have - // - (Current): BScore: 0.489 CSCore 1.105 - // - (New): BScore: 0.475 CSCore 1.305 - // Currently we'd choose the new one because the bottleneck score is - // lower, but the new one duplicates more code. It may be worth it to - // discard the new proposal as the impact on build time is negligible. - - // Compare them - bool IsBest = false; - if (NewBScore < CurBScore) - IsBest = true; - else if (NewBScore == CurBScore) - IsBest = (NewCSScore < CurCSScore); // Use code size as tie breaker. - - if (IsBest) - Best = std::move(New); - - LLVM_DEBUG(if (DebugProposalSearch) { - if (IsBest) - dbgs() << "[search] new best proposal!\n"; - else - dbgs() << "[search] discarding - not profitable\n"; - }); -} - -/// Trivial helper to create an identical copy of \p M. 
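[Editor's note] cloneAll below keeps every definition by returning true from CloneModule's predicate; the same predicate form is what lets a partition keep only its assigned definitions, since globals the predicate rejects come through as declarations. A sketch of that pattern (`clonePartition` and the `Keep` predicate are hypothetical names):

    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Utils/Cloning.h"
    #include "llvm/Transforms/Utils/ValueMapper.h"
    using namespace llvm;

    static std::unique_ptr<Module>
    clonePartition(const Module &M,
                   function_ref<bool(const GlobalValue *)> Keep) {
      ValueToValueMapTy VMap;
      // Globals rejected by Keep are cloned as declarations only, so each
      // partition carries just the definitions assigned to it.
      return CloneModule(M, VMap, Keep);
    }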
-static std::unique_ptr cloneAll(const Module &M) {
- ValueToValueMapTy VMap;
- return CloneModule(M, VMap, [&](const GlobalValue *GV) { return true; });
+ // Unnamed entities must be named consistently between modules. setName will
+ // give a distinct name to each such entity.
+ if (!GV.hasName())
+ GV.setName("__llvmsplit_unnamed");
}

-/// Writes \p SG as a DOTGraph to \ref ModuleDotCfgDir if requested.
-static void writeDOTGraph(const SplitGraph &SG) {
- if (ModuleDotCfgOutput.empty())
- return;
-
- std::error_code EC;
- raw_fd_ostream OS(ModuleDotCfgOutput, EC);
- if (EC) {
- errs() << "[" DEBUG_TYPE "]: cannot open '" << ModuleDotCfgOutput
- << "' - DOTGraph will not be printed\n";
+static bool hasDirectCaller(const Function &Fn) {
+ for (auto &U : Fn.uses()) {
+ if (auto *CB = dyn_cast(U.getUser()); CB && CB->isCallee(&U))
+ return true;
 }
- WriteGraph(OS, SG, /*ShortName=*/false,
- /*Title=*/SG.getModule().getName());
+ return false;
}

 static void splitAMDGPUModule(
- GetTTIFn GetTTI, Module &M, unsigned NumParts,
+ GetTTIFn GetTTI, Module &M, unsigned N,
 function_ref MPart)> ModuleCallback) {
+
+ SplitModuleLogger SML(M);
+
 CallGraph CG(M);

 // Externalize functions whose address are taken.
@@ -1341,8 +639,8 @@ static void splitAMDGPUModule(
 for (auto &Fn : M) {
 if (Fn.hasAddressTaken()) {
 if (Fn.hasLocalLinkage()) {
- LLVM_DEBUG(dbgs() << "[externalize] " << Fn.getName()
- << " because its address is taken\n");
+ SML << "[externalize] " << Fn.getName()
+ << " because its address is taken\n";
 }
 externalize(Fn);
 }
@@ -1353,179 +651,138 @@ static void splitAMDGPUModule(
 if (!NoExternalizeGlobals) {
 for (auto &GV : M.globals()) {
 if (GV.hasLocalLinkage())
- LLVM_DEBUG(dbgs() << "[externalize] GV " << GV.getName() << '\n');
+ SML << "[externalize] GV " << GV.getName() << '\n';
 externalize(GV);
 }
 }

 // Start by calculating the cost of every function in the module, as well as
 // the module's overall cost.
- FunctionsCostMap FnCosts;
- const CostType ModuleCost = calculateFunctionCosts(GetTTI, M, FnCosts);
-
- // Build the SplitGraph, which represents the module's functions and models
- // their dependencies accurately.
- SplitGraph SG(M, FnCosts, ModuleCost);
- SG.buildGraph(CG);
-
- if (SG.empty()) {
- LLVM_DEBUG(
- dbgs()
- << "[!] no nodes in graph, input is empty - no splitting possible\n");
- ModuleCallback(cloneAll(M));
- return;
+ DenseMap FnCosts;
+ const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts);
+
+ // First, gather every kernel into the worklist.
+ SmallVector WorkList;
+ for (auto &Fn : M) {
+ if (isEntryPoint(&Fn) && !Fn.isDeclaration())
+ WorkList.emplace_back(SML, CG, FnCosts, &Fn);
 }

- LLVM_DEBUG({
- dbgs() << "[graph] nodes:\n";
- for (const SplitGraph::Node *N : SG.nodes()) {
- dbgs() << " - [" << N->getID() << "]: " << N->getName() << " "
- << (N->isGraphEntryPoint() ? "(entry)" : "") << "\n";
+ // Then, find missing functions that need to be considered as additional
+ // roots. These can't be called in theory, but in practice we still have to
+ // handle them to avoid linker errors.
+ { + DenseSet SeenFunctions; + for (const auto &FWD : WorkList) { + SeenFunctions.insert(FWD.Fn); + SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end()); } - }); - writeDOTGraph(SG); - - LLVM_DEBUG(dbgs() << "[search] testing splitting strategies\n"); - - std::optional Proposal; - const auto EvaluateProposal = [&](SplitProposal SP) { - if (!Proposal) - Proposal = std::move(SP); - else - evaluateProposal(*Proposal, std::move(SP)); - }; - - // TODO: It would be very easy to create new strategies by just adding a base - // class to RecursiveSearchSplitting and abstracting it away. - RecursiveSearchSplitting(SG, NumParts, EvaluateProposal).run(); - LLVM_DEBUG(if (Proposal) dbgs() << "[search done] selected proposal: " - << Proposal->getName() << "\n";); - - if (!Proposal) { - LLVM_DEBUG(dbgs() << "[!] no proposal made, no splitting possible!\n"); - ModuleCallback(cloneAll(M)); - return; + for (auto &Fn : M) { + // If this function is not part of any kernel's dependencies and isn't + // directly called, consider it as a root. + if (!Fn.isDeclaration() && !isEntryPoint(&Fn) && + !SeenFunctions.count(&Fn) && !hasDirectCaller(Fn)) { + WorkList.emplace_back(SML, CG, FnCosts, &Fn); + } + } } - LLVM_DEBUG(Proposal->print(dbgs());); + // Sort the worklist so the most expensive roots are seen first. + sort(WorkList, [&](auto &A, auto &B) { + // Sort by total cost, and if the total cost is identical, sort + // alphabetically. + if (A.TotalCost == B.TotalCost) + return A.Fn->getName() < B.Fn->getName(); + return A.TotalCost > B.TotalCost; + }); - std::optional SummariesOS; - if (!PartitionSummariesOutput.empty()) { - std::error_code EC; - SummariesOS.emplace(PartitionSummariesOutput, EC); - if (EC) - errs() << "[" DEBUG_TYPE "]: cannot open '" << PartitionSummariesOutput - << "' - Partition summaries will not be printed\n"; + if (SML) { + SML << "Worklist\n"; + for (const auto &FWD : WorkList) { + SML << "[root] " << getName(*FWD.Fn) << " (totalCost:" << FWD.TotalCost + << " indirect:" << FWD.HasIndirectCall + << " hasNonDuplicatableDep:" << FWD.HasNonDuplicatableDependecy + << ")\n"; + // Sort function names before printing to ensure determinism. + SmallVector SortedDepNames; + SortedDepNames.reserve(FWD.Dependencies.size()); + for (const auto *Dep : FWD.Dependencies) + SortedDepNames.push_back(getName(*Dep)); + sort(SortedDepNames); + + for (const auto &Name : SortedDepNames) + SML << " [dependency] " << Name << '\n'; + } } - for (unsigned PID = 0; PID < NumParts; ++PID) { - SplitModuleTimer SMT2("modules_creation", - "creating modules for each partition"); - LLVM_DEBUG(dbgs() << "[split] creating new modules\n"); + // This performs all of the partitioning work. + auto Partitions = doPartitioning(SML, M, N, ModuleCost, FnCosts, WorkList); + assert(Partitions.size() == N); + + // If we didn't externalize GVs, then local GVs need to be conservatively + // imported into every module (including their initializers), and then cleaned + // up afterwards. + const auto NeedsConservativeImport = [&](const GlobalValue *GV) { + // We conservatively import private/internal GVs into every module and clean + // them up afterwards. 
+ const auto *Var = dyn_cast(GV); + return Var && Var->hasLocalLinkage(); + }; - DenseSet FnsInPart; - for (unsigned NodeID : (*Proposal)[PID].set_bits()) - FnsInPart.insert(&SG.getNode(NodeID).getFunction()); + SML << "Creating " << N << " modules...\n"; + unsigned TotalFnImpls = 0; + for (unsigned I = 0; I < N; ++I) { + const auto &FnsInPart = Partitions[I]; ValueToValueMapTy VMap; - CostType PartCost = 0; std::unique_ptr MPart( CloneModule(M, VMap, [&](const GlobalValue *GV) { // Functions go in their assigned partition. - if (const auto *Fn = dyn_cast(GV)) { - if (FnsInPart.contains(Fn)) { - PartCost += SG.getCost(*Fn); - return true; - } - return false; - } + if (const auto *Fn = dyn_cast(GV)) + return FnsInPart.contains(Fn); + + if (NeedsConservativeImport(GV)) + return true; // Everything else goes in the first partition. - return needsConservativeImport(GV) || PID == 0; + return I == 0; })); - // FIXME: Aliases aren't seen often, and their handling isn't perfect so - // bugs are possible. - // Clean-up conservatively imported GVs without any users. - for (auto &GV : make_early_inc_range(MPart->global_values())) { - if (needsConservativeImport(&GV) && GV.use_empty()) + for (auto &GV : make_early_inc_range(MPart->globals())) { + if (NeedsConservativeImport(&GV) && GV.use_empty()) GV.eraseFromParent(); } - if (SummariesOS) - printPartitionSummary(*SummariesOS, PID, *MPart, PartCost, ModuleCost); - - LLVM_DEBUG( - printPartitionSummary(dbgs(), PID, *MPart, PartCost, ModuleCost)); - + unsigned NumAllFns = 0, NumKernels = 0; + for (auto &Cur : *MPart) { + if (!Cur.isDeclaration()) { + ++NumAllFns; + if (isEntryPoint(&Cur)) + ++NumKernels; + } + } + TotalFnImpls += NumAllFns; + SML << " - Module " << I << " with " << NumAllFns << " functions (" + << NumKernels << " kernels)\n"; ModuleCallback(std::move(MPart)); } + + SML << TotalFnImpls << " function definitions across all modules (" + << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100) + << "% of original module)\n"; } } // namespace PreservedAnalyses AMDGPUSplitModulePass::run(Module &M, ModuleAnalysisManager &MAM) { - SplitModuleTimer SMT( - "total", "total pass runtime (incl. potentially waiting for lockfile)"); - FunctionAnalysisManager &FAM = MAM.getResult(M).getManager(); const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & { return FAM.getResult(F); }; - - bool Done = false; -#ifndef NDEBUG - if (UseLockFile) { - SmallString<128> LockFilePath; - sys::path::system_temp_directory(/*ErasedOnReboot=*/true, LockFilePath); - sys::path::append(LockFilePath, "amdgpu-split-module-debug"); - LLVM_DEBUG(dbgs() << DEBUG_TYPE " using lockfile '" << LockFilePath - << "'\n"); - - while (true) { - llvm::LockFileManager Locked(LockFilePath.str()); - switch (Locked) { - case LockFileManager::LFS_Error: - LLVM_DEBUG( - dbgs() << "[amdgpu-split-module] unable to acquire lockfile, debug " - "output may be mangled by other processes\n"); - Locked.unsafeRemoveLockFile(); - break; - case LockFileManager::LFS_Owned: - break; - case LockFileManager::LFS_Shared: { - switch (Locked.waitForUnlock()) { - case LockFileManager::Res_Success: - break; - case LockFileManager::Res_OwnerDied: - continue; // try again to get the lock. 
- case LockFileManager::Res_Timeout: - LLVM_DEBUG( - dbgs() - << "[amdgpu-split-module] unable to acquire lockfile, debug " - "output may be mangled by other processes\n"); - Locked.unsafeRemoveLockFile(); - break; // give up - } - break; - } - } - - splitAMDGPUModule(TTIGetter, M, N, ModuleCallback); - Done = true; - break; - } - } -#endif - - if (!Done) - splitAMDGPUModule(TTIGetter, M, N, ModuleCallback); - - // We can change linkage/visibilities in the input, consider that nothing is - // preserved just to be safe. This pass runs last anyway. - return PreservedAnalyses::none(); + splitAMDGPUModule(TTIGetter, M, N, ModuleCallback); + // We don't change the original module. + return PreservedAnalyses::all(); } -} // namespace llvm diff --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll index 708b5a006be60e..d269f92763853c 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll @@ -1,24 +1,30 @@ -; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0 -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-function-threshold=0 +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s ; 3 kernels: ; - A does a direct call to HelperA ; - B is storing @HelperA ; - C does a direct call to HelperA ; -; The helper functions will get externalized, so C/A will end up -; in the same partition. - -; P0 is empty. -; CHECK0: declare - -; CHECK1: define amdgpu_kernel void @B(ptr %dst) - -; CHECK2: define hidden void @HelperA() -; CHECK2: define amdgpu_kernel void @A() -; CHECK2: define amdgpu_kernel void @C() +; The helper functions will get externalized, which will force A and C into P0 as +; external functions cannot be duplicated. 
+ +; CHECK0: define hidden void @HelperA() +; CHECK0: define amdgpu_kernel void @A() +; CHECK0: declare amdgpu_kernel void @B(ptr) +; CHECK0: define amdgpu_kernel void @C() + +; CHECK1: declare hidden void @HelperA() +; CHECK1: declare amdgpu_kernel void @A() +; CHECK1: declare amdgpu_kernel void @B(ptr) +; CHECK1: declare amdgpu_kernel void @C() + +; CHECK2: declare hidden void @HelperA() +; CHECK2: declare amdgpu_kernel void @A() +; CHECK2: define amdgpu_kernel void @B(ptr %dst) +; CHECK2: declare amdgpu_kernel void @C() define internal void @HelperA() { ret void diff --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll index 81f6c8f0fbb3a6..731cf4b374c95b 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll @@ -1,4 +1,4 @@ -; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0 +; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-function-threshold=0 ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s ; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s diff --git a/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll b/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll new file mode 100644 index 00000000000000..6a07ed51ba1beb --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll @@ -0,0 +1,20 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -debug -amdgpu-module-splitting-log-private 2>&1 | FileCheck %s --implicit-check-not=MyCustomKernel +; REQUIRES: asserts + +; SHA256 of the kernel names. + +; CHECK: a097723d21cf9f35d90e6fb7881995ac8c398b3366a6c97efc657404f9fe301c +; CHECK: 626bc23242de8fcfda7f0e66318d29455c081df6b5380e64d14703c95fcbcd59 +; CHECK: c38d90a7ca71dc5d694bb9e093dadcdedfc4cb4adf7ed7e46d42fe95a0b4ef55 + +define amdgpu_kernel void @MyCustomKernel0() { + ret void +} + +define amdgpu_kernel void @MyCustomKernel1() { + ret void +} + +define amdgpu_kernel void @MyCustomKernel2() { + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/debug-non-kernel-root.ll b/llvm/test/tools/llvm-split/AMDGPU/debug-non-kernel-root.ll new file mode 100644 index 00000000000000..836b5c05d0653d --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/debug-non-kernel-root.ll @@ -0,0 +1,36 @@ +; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -debug 2>&1 | FileCheck %s --implicit-check-not="[root]" +; REQUIRES: asserts + +; func_3 is never directly called, it needs to be considered +; as a root to handle this module correctly. 
+ +; CHECK: [root] kernel_1 +; CHECK-NEXT: [dependency] func_1 +; CHECK-NEXT: [dependency] func_2 +; CHECK-NEXT: [root] func_3 +; CHECK-NEXT: [dependency] func_2 + +define amdgpu_kernel void @kernel_1() { +entry: + call void @func_1() + ret void +} + +define linkonce_odr hidden void @func_1() { +entry: + %call = call i32 @func_2() + ret void +} + +define linkonce_odr hidden i32 @func_2() #0 { +entry: + ret i32 0 +} + +define void @func_3() { +entry: + %call = call i32 @func_2() + ret void +} + +attributes #0 = { noinline optnone } diff --git a/llvm/test/tools/llvm-split/AMDGPU/declarations.ll b/llvm/test/tools/llvm-split/AMDGPU/declarations.ll index 755676061b2557..10b6cdfef4055f 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/declarations.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/declarations.ll @@ -1,13 +1,16 @@ ; RUN: rm -rf %t0 %t1 ; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s -; RUN: not llvm-dis -o - %t1 +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s -; Empty module without any defs should result in a single output module that is -; an exact copy of the input. +; Check that all declarations are put into each partition. ; CHECK0: declare void @A ; CHECK0: declare void @B +; CHECK1: declare void @A +; CHECK1: declare void @B + declare void @A() + declare void @B() diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll index d7e84abd5f968d..c2746d1398924c 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll @@ -1,6 +1,6 @@ ; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s ; 3 kernels: ; - A calls nothing @@ -13,12 +13,16 @@ ; Additionally, @PerryThePlatypus gets externalized as ; the alias counts as taking its address. 
-; CHECK0: define amdgpu_kernel void @A +; CHECK0-NOT: define +; CHECK0: @Perry = internal alias ptr (), ptr @PerryThePlatypus +; CHECK0: define hidden void @PerryThePlatypus() +; CHECK0: define amdgpu_kernel void @B +; CHECK0: define amdgpu_kernel void @C +; CHECK0-NOT: define -; CHECK1: @Perry = internal alias ptr (), ptr @PerryThePlatypus -; CHECK1: define hidden void @PerryThePlatypus() -; CHECK1: define amdgpu_kernel void @B -; CHECK1: define amdgpu_kernel void @C +; CHECK1-NOT: define +; CHECK1: define amdgpu_kernel void @A +; CHECK1-NOT: define @Perry = internal alias ptr(), ptr @PerryThePlatypus diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll index c7e13304dc6dec..4635264aefb39a 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll @@ -1,21 +1,27 @@ ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s ; 3 kernels with each their own dependencies should go into 3 ; distinct partitions. The most expensive kernel should be ; seen first and go into the last partition. +; CHECK0-NOT: define ; CHECK0: define amdgpu_kernel void @C ; CHECK0: define internal void @HelperC ; CHECK0-NOT: define +; CHECK1-NOT: define ; CHECK1: define amdgpu_kernel void @A ; CHECK1: define internal void @HelperA +; CHECK1-NOT: define +; CHECK2-NOT: define ; CHECK2: define amdgpu_kernel void @B ; CHECK2: define internal void @HelperB +; CHECK2-NOT: define + define amdgpu_kernel void @A() { call void @HelperA() diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll index 332344a776e82e..435e97a5813400 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll @@ -1,20 +1,29 @@ ; RUN: llvm-split -o %t %s -j 4 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t3 | FileCheck --check-prefix=CHECK3 --implicit-check-not=define %s +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s +; RUN: llvm-dis -o - %t3 | FileCheck --check-prefix=CHECK3 %s -; CHECK0: define internal void @PrivateHelper1() -; CHECK0: define amdgpu_kernel void @D +; Both overridable helper should go in P0. 
-; CHECK1: define internal void @PrivateHelper0() -; CHECK1: define amdgpu_kernel void @C +; CHECK0-NOT: define +; CHECK0: define available_externally void @OverridableHelper0() +; CHECK0: define internal void @OverridableHelper1() +; CHECK0: define amdgpu_kernel void @A +; CHECK0: define amdgpu_kernel void @B +; CHECK0-NOT: define -; CHECK2: define internal void @OverridableHelper1() -; CHECK2: define amdgpu_kernel void @B +; CHECK1-NOT: define -; CHECK3: define available_externally void @OverridableHelper0() -; CHECK3: define amdgpu_kernel void @A +; CHECK2-NOT: define +; CHECK2: define internal void @PrivateHelper1() +; CHECK2: define amdgpu_kernel void @D +; CHECK2-NOT: define + +; CHECK3-NOT: define +; CHECK3: define internal void @PrivateHelper0() +; CHECK3: define amdgpu_kernel void @C +; CHECK3-NOT: define define available_externally void @OverridableHelper0() { ret void diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll index 5be945bda48bf4..2d870039112cbf 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll @@ -1,7 +1,7 @@ ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s ; We have 4 kernels: ; - Each kernel has an internal helper @@ -15,19 +15,25 @@ ; indirect call. HelperC/D should also end up in P0 as they ; are dependencies of HelperB. 
+; CHECK0-NOT: define +; CHECK0: define hidden void @HelperA +; CHECK0: define hidden void @HelperB +; CHECK0: define hidden void @CallCandidate +; CHECK0: define internal void @HelperC ; CHECK0: define internal void @HelperD -; CHECK0: define amdgpu_kernel void @D +; CHECK0: define amdgpu_kernel void @A +; CHECK0: define amdgpu_kernel void @B +; CHECK0-NOT: define -; CHECK1: define internal void @HelperC -; CHECK1: define amdgpu_kernel void @C +; CHECK1-NOT: define +; CHECK1: define internal void @HelperD +; CHECK1: define amdgpu_kernel void @D +; CHECK1-NOT: define -; CHECK2: define hidden void @HelperA -; CHECK2: define hidden void @HelperB -; CHECK2: define hidden void @CallCandidate +; CHECK2-NOT: define ; CHECK2: define internal void @HelperC -; CHECK2: define internal void @HelperD -; CHECK2: define amdgpu_kernel void @A -; CHECK2: define amdgpu_kernel void @B +; CHECK2: define amdgpu_kernel void @C +; CHECK2-NOT: define @addrthief = global [3 x ptr] [ptr @HelperA, ptr @HelperB, ptr @CallCandidate] diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll index 9205a5d1930e52..dc2c5c3c07bee6 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll @@ -1,15 +1,21 @@ ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s - -; CHECK0: define amdgpu_kernel void @D - -; CHECK1: define amdgpu_kernel void @C - -; CHECK2: define void @ExternalHelper -; CHECK2: define amdgpu_kernel void @A -; CHECK2: define amdgpu_kernel void @B +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; CHECK0-NOT: define +; CHECK0: define void @ExternalHelper +; CHECK0: define amdgpu_kernel void @A +; CHECK0: define amdgpu_kernel void @B +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: define amdgpu_kernel void @D +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: define amdgpu_kernel void @C +; CHECK2-NOT: define define void @ExternalHelper() { ret void diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll index a184d92aea9b9f..0fc76934afc548 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll @@ -1,20 +1,26 @@ ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-no-externalize-globals -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s ; 3 kernels use private/internal global variables. ; The GVs should be copied in each partition as needed. 
+; CHECK0-NOT: define ; CHECK0: @bar = internal constant ptr ; CHECK0: define amdgpu_kernel void @C +; CHECK0-NOT: define +; CHECK1-NOT: define ; CHECK1: @foo = private constant ptr ; CHECK1: define amdgpu_kernel void @A +; CHECK1-NOT: define +; CHECK2-NOT: define ; CHECK2: @foo = private constant ptr ; CHECK2: @bar = internal constant ptr ; CHECK2: define amdgpu_kernel void @B +; CHECK2-NOT: define @foo = private constant ptr poison @bar = internal constant ptr poison diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll index be84a0b5916f0d..7564662e7c7c0c 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll @@ -1,22 +1,28 @@ ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s ; 3 kernels use private/internal global variables. ; The GVs should be copied in each partition as needed. +; CHECK0-NOT: define ; CHECK0: @foo = hidden constant ptr poison ; CHECK0: @bar = hidden constant ptr poison ; CHECK0: define amdgpu_kernel void @C +; CHECK0-NOT: define +; CHECK1-NOT: define ; CHECK1: @foo = external hidden constant ptr{{$}} ; CHECK1: @bar = external hidden constant ptr{{$}} ; CHECK1: define amdgpu_kernel void @A +; CHECK1-NOT: define +; CHECK2-NOT: define ; CHECK2: @foo = external hidden constant ptr{{$}} ; CHECK2: @bar = external hidden constant ptr{{$}} ; CHECK2: define amdgpu_kernel void @B +; CHECK2-NOT: define @foo = private constant ptr poison @bar = internal constant ptr poison diff --git a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll index 807fb2e5f33cea..459c5a7f1a2db3 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll @@ -1,12 +1,12 @@ -; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=0 -amdgpu-module-splitting-large-threshold=1.2 -amdgpu-module-splitting-merge-threshold=0.5 -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-function-threshold=1.2 -amdgpu-module-splitting-large-function-merge-overlap=0.5 +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s -; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0 -amdgpu-module-splitting-max-depth=0 -; RUN: llvm-dis -o - %t.nolarge0 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t.nolarge1 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK1 
--implicit-check-not=define %s -; RUN: llvm-dis -o - %t.nolarge2 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK2 --implicit-check-not=define %s +; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-function-threshold=0 +; RUN: llvm-dis -o - %t.nolarge0 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK0 %s +; RUN: llvm-dis -o - %t.nolarge1 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK1 %s +; RUN: llvm-dis -o - %t.nolarge2 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK2 %s ; 2 kernels (A/B) are large and share all their dependencies. ; They should go in the same partition, the remaining kernel should @@ -15,12 +15,14 @@ ; Also check w/o large kernels processing to verify they are indeed handled ; differently. -; P0 is empty -; CHECK0: declare +; CHECK0-NOT: define +; CHECK1-NOT: define ; CHECK1: define internal void @HelperC() ; CHECK1: define amdgpu_kernel void @C +; CHECK1-NOT: define +; CHECK2-NOT: define ; CHECK2: define internal void @large2() ; CHECK2: define internal void @large1() ; CHECK2: define internal void @large0() @@ -28,9 +30,12 @@ ; CHECK2: define internal void @HelperB() ; CHECK2: define amdgpu_kernel void @A ; CHECK2: define amdgpu_kernel void @B +; CHECK2-NOT: define +; NOLARGEKERNELS-CHECK0-NOT: define ; NOLARGEKERNELS-CHECK0: define internal void @HelperC() ; NOLARGEKERNELS-CHECK0: define amdgpu_kernel void @C +; NOLARGEKERNELS-CHECK0-NOT: define ; NOLARGEKERNELS-CHECK1: define internal void @large2() ; NOLARGEKERNELS-CHECK1: define internal void @large1() @@ -44,7 +49,6 @@ ; NOLARGEKERNELS-CHECK2: define internal void @HelperA() ; NOLARGEKERNELS-CHECK2: define amdgpu_kernel void @A - define internal void @large2() { store volatile i32 42, ptr null call void @large2() diff --git a/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependency-indirect.ll b/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependency-indirect.ll index 1314a78b42f3b0..167930ce0e8063 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependency-indirect.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependency-indirect.ll @@ -1,7 +1,7 @@ ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=DEFINE %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=DEFINE %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=DEFINE %s ; We have 4 function: ; - Each function has an internal helper @@ -11,19 +11,19 @@ ; @CallCandidate doesn't have to be in A/B's partition, unlike ; in the corresponding tests for kernels where it has to. 
+; CHECK0: define hidden void @HelperA +; CHECK0: define hidden void @HelperB ; CHECK0: define internal void @HelperC ; CHECK0: define internal void @HelperD -; CHECK0: define internal void @C -; CHECK0: define internal void @D +; CHECK0: define void @A +; CHECK0: define void @B -; CHECK1: define hidden void @HelperA -; CHECK1: define hidden void @CallCandidate() -; CHECK1: define internal void @A +; CHECK1: define internal void @HelperD +; CHECK1: define void @D -; CHECK2: define hidden void @HelperB +; CHECK2: define hidden void @CallCandidate ; CHECK2: define internal void @HelperC -; CHECK2: define internal void @HelperD -; CHECK2: define internal void @B +; CHECK2: define void @C @addrthief = global [3 x ptr] [ptr @HelperA, ptr @HelperB, ptr @CallCandidate] @@ -51,22 +51,22 @@ define internal void @HelperD() { ret void } -define internal void @A(ptr %call) { +define void @A(ptr %call) { call void @HelperA(ptr %call) ret void } -define internal void @B(ptr %call) { +define void @B(ptr %call) { call void @HelperB(ptr %call) ret void } -define internal void @C() { +define void @C() { call void @HelperC() ret void } -define internal void @D() { +define void @D() { call void @HelperD() ret void } diff --git a/llvm/test/tools/llvm-split/AMDGPU/recursive-search-2.ll b/llvm/test/tools/llvm-split/AMDGPU/recursive-search-2.ll deleted file mode 100644 index 01f2f3627f9905..00000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/recursive-search-2.ll +++ /dev/null @@ -1,128 +0,0 @@ -; RUN: llvm-split -o %t_s3_ %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=2 -; RUN: llvm-dis -o - %t_s3_0 | FileCheck --check-prefix=SPLIT3-CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s3_1 | FileCheck --check-prefix=SPLIT3-CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s3_2 | FileCheck --check-prefix=SPLIT3-CHECK2 --implicit-check-not=define %s - -; RUN: llvm-split -o %t_s5_ %s -j 5 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=2 -; RUN: llvm-dis -o - %t_s5_0 | FileCheck --check-prefix=SPLIT5-CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s5_1 | FileCheck --check-prefix=SPLIT5-CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s5_2 | FileCheck --check-prefix=SPLIT5-CHECK2 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s5_3 | FileCheck --check-prefix=SPLIT5-CHECK3 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s5_4 | FileCheck --check-prefix=SPLIT5-CHECK4 --implicit-check-not=define %s - -; Test the specifics of the search algorithm. -; This test will change depending on new heuristics we add or remove. 
- -; -------------------------------------------- - -; SPLIT3-CHECK0: define internal void @HelperA() -; SPLIT3-CHECK0: define internal void @HelperB() -; SPLIT3-CHECK0: define internal void @HelperC() -; SPLIT3-CHECK0: define amdgpu_kernel void @AB() -; SPLIT3-CHECK0: define amdgpu_kernel void @BC() - -; SPLIT3-CHECK1: define amdgpu_kernel void @A() -; SPLIT3-CHECK1: define internal void @HelperA() -; SPLIT3-CHECK1: define amdgpu_kernel void @C() -; SPLIT3-CHECK1: define internal void @HelperC() - -; SPLIT3-CHECK2: define internal void @HelperA() -; SPLIT3-CHECK2: define amdgpu_kernel void @B() -; SPLIT3-CHECK2: define internal void @HelperB() -; SPLIT3-CHECK2: define internal void @HelperC() -; SPLIT3-CHECK2: define amdgpu_kernel void @ABC() - -; -------------------------------------------- - -; SPLIT5-CHECK0: define amdgpu_kernel void @A() -; SPLIT5-CHECK0: define internal void @HelperA() -; SPLIT5-CHECK0: define amdgpu_kernel void @B() -; SPLIT5-CHECK0: define internal void @HelperB() - -; SPLIT5-CHECK1: define internal void @HelperB() -; SPLIT5-CHECK1: define internal void @HelperC() -; SPLIT5-CHECK1: define amdgpu_kernel void @BC - -; SPLIT5-CHECK2: define internal void @HelperA() -; SPLIT5-CHECK2: define internal void @HelperB() -; SPLIT5-CHECK2: define amdgpu_kernel void @AB() - -; SPLIT5-CHECK3: define amdgpu_kernel void @C() -; SPLIT5-CHECK3: define internal void @HelperC() - -; SPLIT5-CHECK4: define internal void @HelperA() -; SPLIT5-CHECK4: define internal void @HelperB() -; SPLIT5-CHECK4: define internal void @HelperC() -; SPLIT5-CHECK4: define amdgpu_kernel void @ABC() - -define amdgpu_kernel void @A() { - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - call void @HelperA() - ret void -} - -define internal void @HelperA() { - store volatile i32 42, ptr null - store volatile i32 42, ptr null - ret void -} - -define amdgpu_kernel void @B() { - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - call void @HelperB() - ret void -} - -define internal void @HelperB() { - store volatile i32 42, ptr null - store volatile i32 42, ptr null - store volatile i32 42, ptr null - ret void -} - -define amdgpu_kernel void @C() { - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - call void @HelperC() - ret void -} - -define internal void @HelperC() { - store volatile i32 42, ptr null - ret void -} - -define amdgpu_kernel void @AB() { - store volatile i32 42, ptr null - call void @HelperA() - call void @HelperB() - ret void -} - -define amdgpu_kernel void @BC() { - store volatile i32 42, ptr null - store volatile i32 42, ptr null - call void @HelperB() - call void @HelperC() - ret void -} - -define amdgpu_kernel void @ABC() { - call void @HelperA() - call void @HelperB() - call void @HelperC() - ret void -} diff --git a/llvm/test/tools/llvm-split/AMDGPU/recursive-search-8.ll b/llvm/test/tools/llvm-split/AMDGPU/recursive-search-8.ll deleted file mode 100644 index eae57a19883106..00000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/recursive-search-8.ll +++ /dev/null @@ -1,128 +0,0 @@ -; RUN: llvm-split -o %t_s3_ %s -j 3 -mtriple amdgcn-amd-amdhsa 
-amdgpu-module-splitting-max-depth=8 -; RUN: llvm-dis -o - %t_s3_0 | FileCheck --check-prefix=SPLIT3-CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s3_1 | FileCheck --check-prefix=SPLIT3-CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s3_2 | FileCheck --check-prefix=SPLIT3-CHECK2 --implicit-check-not=define %s - -; RUN: llvm-split -o %t_s5_ %s -j 5 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=8 -; RUN: llvm-dis -o - %t_s5_0 | FileCheck --check-prefix=SPLIT5-CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s5_1 | FileCheck --check-prefix=SPLIT5-CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s5_2 | FileCheck --check-prefix=SPLIT5-CHECK2 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s5_3 | FileCheck --check-prefix=SPLIT5-CHECK3 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s5_4 | FileCheck --check-prefix=SPLIT5-CHECK4 --implicit-check-not=define %s - -; Test the specifics of the search algorithm. -; This test will change depending on new heuristics we add or remove. - -; -------------------------------------------- - -; SPLIT3-CHECK0: define internal void @HelperA() -; SPLIT3-CHECK0: define internal void @HelperB() -; SPLIT3-CHECK0: define internal void @HelperC() -; SPLIT3-CHECK0: define amdgpu_kernel void @AB() -; SPLIT3-CHECK0: define amdgpu_kernel void @BC() - -; SPLIT3-CHECK1: define amdgpu_kernel void @A() -; SPLIT3-CHECK1: define internal void @HelperA() -; SPLIT3-CHECK1: define amdgpu_kernel void @C() -; SPLIT3-CHECK1: define internal void @HelperC() - -; SPLIT3-CHECK2: define internal void @HelperA() -; SPLIT3-CHECK2: define amdgpu_kernel void @B() -; SPLIT3-CHECK2: define internal void @HelperB() -; SPLIT3-CHECK2: define internal void @HelperC() -; SPLIT3-CHECK2: define amdgpu_kernel void @ABC() - -; -------------------------------------------- - -; SPLIT5-CHECK0: define amdgpu_kernel void @A() -; SPLIT5-CHECK0: define internal void @HelperA() -; SPLIT5-CHECK0: define amdgpu_kernel void @B() -; SPLIT5-CHECK0: define internal void @HelperB() - -; SPLIT5-CHECK1: define internal void @HelperB() -; SPLIT5-CHECK1: define internal void @HelperC() -; SPLIT5-CHECK1: define amdgpu_kernel void @BC - -; SPLIT5-CHECK2: define internal void @HelperA() -; SPLIT5-CHECK2: define internal void @HelperB() -; SPLIT5-CHECK2: define amdgpu_kernel void @AB() - -; SPLIT5-CHECK3: define amdgpu_kernel void @C() -; SPLIT5-CHECK3: define internal void @HelperC() - -; SPLIT5-CHECK4: define internal void @HelperA() -; SPLIT5-CHECK4: define internal void @HelperB() -; SPLIT5-CHECK4: define internal void @HelperC() -; SPLIT5-CHECK4: define amdgpu_kernel void @ABC() - -define amdgpu_kernel void @A() { - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - call void @HelperA() - ret void -} - -define internal void @HelperA() { - store volatile i32 42, ptr null - store volatile i32 42, ptr null - ret void -} - -define amdgpu_kernel void @B() { - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - call void @HelperB() - ret void -} - -define internal void @HelperB() { - store volatile i32 42, ptr null - store volatile i32 42, ptr null - store volatile i32 42, ptr null - ret void -} - -define amdgpu_kernel void @C() { - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - 
store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - call void @HelperC() - ret void -} - -define internal void @HelperC() { - store volatile i32 42, ptr null - ret void -} - -define amdgpu_kernel void @AB() { - store volatile i32 42, ptr null - call void @HelperA() - call void @HelperB() - ret void -} - -define amdgpu_kernel void @BC() { - store volatile i32 42, ptr null - store volatile i32 42, ptr null - call void @HelperB() - call void @HelperC() - ret void -} - -define amdgpu_kernel void @ABC() { - call void @HelperA() - call void @HelperB() - call void @HelperC() - ret void -} From 2d5613afec0f4afeeb03cfd4edac556a65ad0eaf Mon Sep 17 00:00:00 2001 From: Richard Howell Date: Fri, 30 Aug 2024 05:09:16 -0700 Subject: [PATCH 18/98] [dsymutil] return EXIT_FAILURE when Crashed (#106619) Make dsymutil return a non-zero exit code when crashing during linking. --- llvm/tools/dsymutil/dsymutil.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/tools/dsymutil/dsymutil.cpp b/llvm/tools/dsymutil/dsymutil.cpp index 728f2ed3e62aca..364a7d63d486e1 100644 --- a/llvm/tools/dsymutil/dsymutil.cpp +++ b/llvm/tools/dsymutil/dsymutil.cpp @@ -835,7 +835,7 @@ int dsymutil_main(int argc, char **argv, const llvm::ToolContext &) { if (Crashed) (*Repro)->generate(); - if (!AllOK) + if (!AllOK || Crashed) return EXIT_FAILURE; if (NeedsTempFiles) { From 87a988e881ac92e3d87aae01dc632f33c1fb36aa Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 30 Aug 2024 05:16:57 -0700 Subject: [PATCH 19/98] [SLP]Fix PR106655: Use FinalShuffle for alternate cast nodes. Need to use FinalShuffle function for all vectorized results to correctly produce vectorized value. 
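For illustration, a minimal sketch of the pattern this change enforces,
distilled from the diff below rather than quoted verbatim (the
FinalShuffle lambda also drops its VecTy parameter, which the diff
removes from every call site):

    // Before: the alternate cast/cmp path stored its result directly,
    // bypassing the final reordering/resizing shuffle.
    E->VectorizedValue = LHS;

    // After: the result is routed through FinalShuffle first, as on
    // all other vectorization paths.
    LHS = FinalShuffle(LHS, E);
    E->VectorizedValue = LHS;
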
Fixes https://github.com/llvm/llvm-project/issues/106655 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 31 +-- .../resized-alt-shuffle-after-minbw.ll | 208 ++++++++++++++++++ 2 files changed, 224 insertions(+), 15 deletions(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index edb2567fa057b3..345b01b82c6aa4 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -13137,7 +13137,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { } bool IsReverseOrder = isReverseOrder(E->ReorderIndices); - auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) { + auto FinalShuffle = [&](Value *V, const TreeEntry *E) { ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this); if (E->getOpcode() == Instruction::Store && E->State == TreeEntry::Vectorize) { @@ -13197,7 +13197,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { PH->getParent()->getFirstInsertionPt()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; if (PostponedPHIs) @@ -13249,7 +13249,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { if (const TreeEntry *TE = getTreeEntry(V)) V = TE->VectorizedValue; setInsertPointAfterBundle(E); - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; return V; } @@ -13259,7 +13259,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { Value *Ptr = LI->getPointerOperand(); LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); Value *NewV = propagateMetadata(V, E->Scalars); - NewV = FinalShuffle(NewV, E, VecTy); + NewV = FinalShuffle(NewV, E); E->VectorizedValue = NewV; return NewV; } @@ -13474,7 +13474,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast) ? InVec : Builder.CreateCast(VecOpcode, InVec, VecTy); - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -13518,7 +13518,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { propagateIRFlags(V, E->Scalars, VL0); // Do not cast for cmps. 
VecTy = cast(V->getType()); - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -13571,7 +13571,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { assert(getNumElements(Cond->getType()) == TrueNumElements && "Cannot vectorize Instruction::Select"); Value *V = Builder.CreateSelect(Cond, True, False); - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -13593,7 +13593,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -13611,7 +13611,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { } Value *V = Builder.CreateFreeze(Op); - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -13655,7 +13655,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { auto *CI = dyn_cast(Op); return CI && CI->getValue().countr_one() >= It->second.first; })) { - V = FinalShuffle(I == 0 ? RHS : LHS, E, VecTy); + V = FinalShuffle(I == 0 ? RHS : LHS, E); E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -13688,7 +13688,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { I->setHasNoUnsignedWrap(/*b=*/false); } - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -13780,7 +13780,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { } Value *V = propagateMetadata(NewLI, E->Scalars); - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -13794,7 +13794,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { if (VecValue->getType() != VecTy) VecValue = Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0)); - VecValue = FinalShuffle(VecValue, E, VecTy); + VecValue = FinalShuffle(VecValue, E); Value *Ptr = SI->getPointerOperand(); Instruction *ST; @@ -13859,7 +13859,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { V = propagateMetadata(I, GEPs); } - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -13941,7 +13941,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { Value *V = Builder.CreateCall(CF, OpVecs, OpBundles); propagateIRFlags(V, E->Scalars, VL0); - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -14039,6 +14039,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { "Expected same type as operand."); if (auto *I = dyn_cast(LHS)) LHS = propagateMetadata(I, E->Scalars); + LHS = FinalShuffle(LHS, E); E->VectorizedValue = LHS; ++NumVectorInstructions; return LHS; diff --git a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll new file mode 100644 index 00000000000000..56281424c7114a --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll @@ -0,0 +1,208 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -slp-vectorize-hor=false < %s | FileCheck %s + +define void @func(i32 %0) { +; CHECK-LABEL: define void @func( +; CHECK-SAME: 
i32 [[TMP0:%.*]]) { +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shl i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP9]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <32 x i32> [[TMP11]], <32 x i32> , <32 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <32 x i32> [[TMP12]], i32 0, i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> [[TMP13]], <8 x i32> zeroinitializer, i64 16) +; CHECK-NEXT: [[TMP15:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v4i32(<32 x i32> [[TMP14]], <4 x i32> zeroinitializer, i64 24) +; CHECK-NEXT: [[TMP16:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v2i32(<32 x i32> [[TMP15]], <2 x i32> zeroinitializer, i64 14) +; CHECK-NEXT: [[TMP17:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v2i32(<32 x i32> [[TMP16]], <2 x i32> zeroinitializer, i64 28) +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i32> [[TMP8]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = sext <32 x i32> [[TMP18]] to <32 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = icmp slt <32 x i64> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <32 x i1> [[TMP20]], i32 31 +; CHECK-NEXT: [[TMP22:%.*]] = and i1 false, [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <32 x i1> [[TMP20]], i32 30 +; CHECK-NEXT: [[TMP24:%.*]] = and i1 false, [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <32 x i1> [[TMP20]], i32 29 +; CHECK-NEXT: [[TMP26:%.*]] = and i1 false, [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <32 x i1> [[TMP20]], i32 28 +; CHECK-NEXT: [[TMP28:%.*]] = and i1 false, [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <32 x i1> [[TMP20]], i32 27 +; CHECK-NEXT: [[TMP30:%.*]] = and i1 false, [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <32 x i1> [[TMP20]], i32 26 +; CHECK-NEXT: [[TMP32:%.*]] = and i1 false, [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <32 x i1> [[TMP20]], i32 25 +; CHECK-NEXT: [[TMP34:%.*]] = and i1 false, [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <32 x i1> [[TMP20]], i32 24 +; CHECK-NEXT: [[TMP36:%.*]] = and i1 false, [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <32 x i1> [[TMP20]], i32 23 +; CHECK-NEXT: [[TMP38:%.*]] = and i1 false, [[TMP37]] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <32 x i1> [[TMP20]], i32 22 +; CHECK-NEXT: [[TMP40:%.*]] = and i1 false, [[TMP39]] +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <32 x i1> [[TMP20]], i32 21 +; CHECK-NEXT: [[TMP42:%.*]] = and i1 false, [[TMP41]] +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <32 x i1> [[TMP20]], i32 20 +; CHECK-NEXT: [[TMP44:%.*]] = and i1 false, [[TMP43]] +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <32 x i1> [[TMP20]], i32 19 +; CHECK-NEXT: [[TMP46:%.*]] = and i1 false, [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <32 x i1> [[TMP20]], i32 18 +; CHECK-NEXT: [[TMP48:%.*]] = and i1 false, [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <32 x i1> [[TMP20]], i32 17 +; CHECK-NEXT: 
[[TMP50:%.*]] = and i1 false, [[TMP49]] +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <32 x i1> [[TMP20]], i32 16 +; CHECK-NEXT: [[TMP52:%.*]] = and i1 false, [[TMP51]] +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <32 x i1> [[TMP20]], i32 15 +; CHECK-NEXT: [[TMP54:%.*]] = and i1 false, [[TMP53]] +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <32 x i1> [[TMP20]], i32 14 +; CHECK-NEXT: [[TMP56:%.*]] = and i1 false, [[TMP55]] +; CHECK-NEXT: [[TMP57:%.*]] = extractelement <32 x i1> [[TMP20]], i32 13 +; CHECK-NEXT: [[TMP58:%.*]] = and i1 false, [[TMP57]] +; CHECK-NEXT: [[TMP59:%.*]] = extractelement <32 x i1> [[TMP20]], i32 12 +; CHECK-NEXT: [[TMP60:%.*]] = and i1 false, [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = extractelement <32 x i1> [[TMP20]], i32 11 +; CHECK-NEXT: [[TMP62:%.*]] = and i1 false, [[TMP61]] +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <32 x i1> [[TMP20]], i32 10 +; CHECK-NEXT: [[TMP64:%.*]] = and i1 false, [[TMP63]] +; CHECK-NEXT: [[TMP65:%.*]] = extractelement <32 x i1> [[TMP20]], i32 9 +; CHECK-NEXT: [[TMP66:%.*]] = and i1 false, [[TMP65]] +; CHECK-NEXT: [[TMP67:%.*]] = extractelement <32 x i1> [[TMP20]], i32 8 +; CHECK-NEXT: [[TMP68:%.*]] = and i1 false, [[TMP67]] +; CHECK-NEXT: [[TMP69:%.*]] = extractelement <32 x i1> [[TMP20]], i32 7 +; CHECK-NEXT: [[TMP70:%.*]] = and i1 false, [[TMP69]] +; CHECK-NEXT: [[TMP71:%.*]] = extractelement <32 x i1> [[TMP20]], i32 6 +; CHECK-NEXT: [[TMP72:%.*]] = and i1 false, [[TMP71]] +; CHECK-NEXT: [[TMP73:%.*]] = extractelement <32 x i1> [[TMP20]], i32 5 +; CHECK-NEXT: [[TMP74:%.*]] = and i1 false, [[TMP73]] +; CHECK-NEXT: [[TMP75:%.*]] = extractelement <32 x i1> [[TMP20]], i32 4 +; CHECK-NEXT: [[TMP76:%.*]] = and i1 false, [[TMP75]] +; CHECK-NEXT: [[TMP77:%.*]] = extractelement <32 x i32> [[TMP18]], i32 0 +; CHECK-NEXT: [[TMP78:%.*]] = sext i32 [[TMP77]] to i64 +; CHECK-NEXT: [[TMP79:%.*]] = getelementptr float, ptr addrspace(1) null, i64 [[TMP78]] +; CHECK-NEXT: ret void +; + %2 = shl i32 %0, 0 + %3 = sext i32 %2 to i64 + %4 = shl i32 0, 0 + %5 = sext i32 %4 to i64 + %6 = or i32 0, 0 + %7 = or i32 0, 0 + %8 = zext i32 %6 to i64 + %9 = zext i32 %7 to i64 + %10 = zext i32 0 to i64 + %11 = zext i32 0 to i64 + %12 = zext i32 0 to i64 + %13 = zext i32 0 to i64 + %14 = zext i32 0 to i64 + %15 = zext i32 0 to i64 + %16 = zext i32 0 to i64 + %17 = zext i32 0 to i64 + %18 = zext i32 0 to i64 + %19 = zext i32 0 to i64 + %20 = zext i32 0 to i64 + %21 = zext i32 0 to i64 + %22 = zext i32 0 to i64 + %23 = zext i32 0 to i64 + %24 = zext i32 0 to i64 + %25 = zext i32 0 to i64 + %26 = zext i32 0 to i64 + %27 = or i64 %3, 0 + %28 = or i64 %3, %8 + %29 = or i64 %3, %9 + %30 = or i64 %3, %10 + %31 = or i64 %3, %11 + %32 = or i64 %3, %12 + %33 = or i64 %3, %13 + %34 = or i64 %3, %14 + %35 = or i64 %3, %15 + %36 = or i64 %3, %16 + %37 = or i64 %3, %17 + %38 = or i64 %3, %18 + %39 = or i64 %3, %19 + %40 = or i64 %3, %20 + %41 = or i64 %3, %21 + %42 = or i64 %3, %22 + %43 = or i64 %3, %23 + %44 = or i64 %3, %24 + %45 = or i64 %3, %25 + %46 = or i64 %3, 0 + %47 = or i64 %3, 0 + %48 = or i64 %3, 0 + %49 = or i64 %3, 0 + %50 = or i64 %3, 0 + %51 = or i64 %3, 0 + %52 = or i64 %3, 0 + %53 = or i64 %3, 0 + %54 = or i64 %3, 0 + %55 = or i64 %3, 0 + %56 = or i64 %3, 0 + %57 = or i64 %3, 0 + %58 = or i64 %3, 0 + %59 = icmp slt i64 %28, 0 + %60 = icmp slt i64 %29, 0 + %61 = icmp slt i64 %30, 0 + %62 = icmp slt i64 %31, 0 + %63 = icmp slt i64 %32, 0 + %64 = icmp slt i64 %33, 0 + %65 = icmp slt i64 %34, 0 + %66 = icmp slt i64 %35, 0 + %67 = icmp slt i64 %36, 0 + %68 = icmp 
slt i64 %37, 0 + %69 = icmp slt i64 %38, 0 + %70 = icmp slt i64 %39, 0 + %71 = icmp slt i64 %40, 0 + %72 = icmp slt i64 %41, 0 + %73 = icmp slt i64 %42, 0 + %74 = icmp slt i64 %43, 0 + %75 = icmp slt i64 %44, 0 + %76 = icmp slt i64 %45, 0 + %77 = icmp slt i64 %46, 0 + %78 = icmp slt i64 %47, 0 + %79 = icmp slt i64 %48, 0 + %80 = icmp slt i64 %49, 0 + %81 = icmp slt i64 %50, 0 + %82 = icmp slt i64 %51, 0 + %83 = icmp slt i64 %52, 0 + %84 = icmp slt i64 %53, 0 + %85 = icmp slt i64 %54, 0 + %86 = icmp slt i64 %55, 0 + %87 = icmp slt i64 %56, 0 + %88 = icmp slt i64 %57, 0 + %89 = icmp slt i64 %58, 0 + %90 = and i1 false, %59 + %91 = and i1 false, %60 + %92 = and i1 false, %61 + %93 = and i1 false, %62 + %94 = and i1 false, %63 + %95 = and i1 false, %64 + %96 = and i1 false, %65 + %97 = and i1 false, %66 + %98 = and i1 false, %67 + %99 = and i1 false, %68 + %100 = and i1 false, %69 + %101 = and i1 false, %70 + %102 = and i1 false, %71 + %103 = and i1 false, %72 + %104 = and i1 false, %73 + %105 = and i1 false, %74 + %106 = and i1 false, %75 + %107 = and i1 false, %76 + %108 = icmp eq i32 %2, 0 + %109 = and i1 false, %77 + %110 = and i1 false, %78 + %111 = and i1 false, %79 + %112 = and i1 false, %80 + %113 = and i1 false, %81 + %114 = and i1 false, %82 + %115 = and i1 false, %83 + %116 = and i1 false, %84 + %117 = and i1 false, %85 + %118 = and i1 false, %86 + %119 = or i64 %5, %26 + %120 = getelementptr float, ptr addrspace(1) null, i64 %119 + %121 = icmp slt i64 %119, 0 + ret void +} From ce5620ba9a5bf48bce4e49933aec531c70c54aeb Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Fri, 30 Aug 2024 13:30:23 +0100 Subject: [PATCH 20/98] [LLVM][VPlan] Pick more optimal initial value for VPBlend. (#104019) By choosing an initial value whose mask is only used by the blend we can remove the need for the mask entirely. --- .../Transforms/Vectorize/VPlanTransforms.cpp | 12 ++ .../LoopVectorize/AArch64/masked-call.ll | 48 +++--- .../AArch64/scalable-strict-fadd.ll | 28 ++-- .../LoopVectorize/AArch64/sve-tail-folding.ll | 23 ++- .../LoopVectorize/if-conversion-nest.ll | 146 ++++++++++++++++-- .../Transforms/LoopVectorize/if-reduction.ll | 5 +- .../test/Transforms/LoopVectorize/phi-cost.ll | 4 +- .../LoopVectorize/reduction-inloop-cond.ll | 4 +- .../LoopVectorize/single-value-blend-phis.ll | 30 ++-- 9 files changed, 207 insertions(+), 93 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index ee7c7cea0b7670..9796ee64f6ef90 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -878,6 +878,17 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { // value with the others blended into it. unsigned StartIndex = 0; + for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { + // If a value's mask is used only by the blend then is can be deadcoded. + // TODO: Find the most expensive mask that can be deadcoded, or a mask + // that's used by multiple blends where it can be removed from them all. 
+ VPValue *Mask = Blend->getMask(I); + if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) { + StartIndex = I; + break; + } + } + SmallVector OperandsWithMask; OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex)); @@ -956,6 +967,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) && X == X1 && Y == Y1) { R.getVPSingleValue()->replaceAllUsesWith(X); + R.eraseFromParent(); return; } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll index f467f3cf262d2f..93034f4dbe56ec 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -215,16 +215,14 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON-NEXT: [[TMP6:%.*]] = icmp ugt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) ; TFCOMMON-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer ; TFCOMMON-NEXT: [[TMP8:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP7]]) -; TFCOMMON-NEXT: [[TMP9:%.*]] = xor [[TMP6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFCOMMON-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer -; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], zeroinitializer, [[TMP8]] -; TFCOMMON-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP11]], i32 8, [[ACTIVE_LANE_MASK]]) +; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP8]], zeroinitializer +; TFCOMMON-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]]) ; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) -; TFCOMMON-NEXT: [[TMP12:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFCOMMON-NEXT: [[TMP13:%.*]] = extractelement [[TMP12]], i32 0 -; TFCOMMON-NEXT: br i1 [[TMP13]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; TFCOMMON-NEXT: [[TMP10:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFCOMMON-NEXT: [[TMP11:%.*]] = extractelement [[TMP10]], i32 0 +; TFCOMMON-NEXT: br i1 [[TMP11]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; TFCOMMON: for.cond.cleanup: ; TFCOMMON-NEXT: ret void ; @@ -259,27 +257,23 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP12]], zeroinitializer ; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP13]]) ; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD3]], [[TMP14]]) -; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = xor [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = xor [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = select 
[[ACTIVE_LANE_MASK]], [[TMP17]], zeroinitializer -; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP18]], zeroinitializer -; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select [[TMP19]], zeroinitializer, [[TMP15]] -; TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select [[TMP20]], zeroinitializer, [[TMP16]] -; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i64 [[TMP23]] -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP21]], i32 8, [[ACTIVE_LANE_MASK]]) -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI4]], ptr [[TMP24]], i32 8, [[ACTIVE_LANE_MASK2]]) +; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select [[TMP13]], [[TMP15]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select [[TMP14]], [[TMP16]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP17]], i64 [[TMP19]] +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP17]], i32 8, [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI4]], ptr [[TMP20]], i32 8, [[ACTIVE_LANE_MASK2]]) ; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] -; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = add i64 [[INDEX_NEXT]], [[TMP26]] +; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = add i64 [[INDEX_NEXT]], [[TMP22]] ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP27]], i64 1025) -; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement [[TMP28]], i32 0 -; TFA_INTERLEAVE-NEXT: br i1 [[TMP29]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP23]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement [[TMP24]], i32 0 +; TFA_INTERLEAVE-NEXT: br i1 [[TMP25]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; TFA_INTERLEAVE: for.cond.cleanup: ; TFA_INTERLEAVE-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll index f922873210b052..66d001498e457b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ 
b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -1216,7 +1216,7 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 ; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP10]] ; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0 @@ -1226,41 +1226,39 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP10]] ; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP15]], i32 0 ; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP16]], i32 4, [[TMP14]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP17]], zeroinitializer -; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select [[TMP18]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD1]] -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = select [[ACTIVE_LANE_MASK]], [[PREDPHI]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP20]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP19]]) +; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select [[TMP14]], [[WIDE_MASKED_LOAD1]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = select [[ACTIVE_LANE_MASK]], [[PREDPHI]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP18]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP17]]) ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) -; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = extractelement [[TMP21]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = extractelement [[TMP19]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] 
; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 1.000000e+00, [[ENTRY]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ 1.000000e+00, [[ENTRY]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; CHECK-ORDERED-TF-NEXT: [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-ORDERED-TF-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP23]], 0.000000e+00 +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP21]], 0.000000e+00 ; CHECK-ORDERED-TF-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK-ORDERED-TF: if.then: ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 ; CHECK-ORDERED-TF-NEXT: br label [[FOR_INC]] ; CHECK-ORDERED-TF: for.inc: -; CHECK-ORDERED-TF-NEXT: [[PHI:%.*]] = phi float [ [[TMP24]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[PHI:%.*]] = phi float [ [[TMP22]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]] ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[RDX]] ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll index f6a6d021f03c9f..6fa1e7fbbac602 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -467,16 +467,15 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr ; CHECK-NEXT: [[TMP14:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP14]], zeroinitializer ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], i32 4, [[TMP15]], poison) -; CHECK-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP16]], zeroinitializer, [[WIDE_MASKED_GATHER]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 -; CHECK-NEXT: call void 
@llvm.masked.store.nxv4i32.p0( [[PREDPHI]], ptr [[TMP18]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[WIDE_MASKED_GATHER]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[PREDPHI]], ptr [[TMP17]], i32 4, [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) -; CHECK-NEXT: [[TMP19:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP20:%.*]] = extractelement [[TMP19]], i32 0 -; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP19:%.*]] = extractelement [[TMP18]], i32 0 +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -485,14 +484,14 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr ; CHECK: for.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COND]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP21]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP20]], 0 ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[SRC]], align 4 ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[TMP22]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ] +; CHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[TMP21]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]] ; CHECK-NEXT: store i32 [[VAL_0]], ptr [[ARRAYIDX1]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll index d19ca172a8c0a8..8b0c99b353c8b7 100644 --- a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll +++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll @@ -33,18 +33,15 @@ define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) { ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META3]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = and <4 x i1> [[TMP7]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD2]], -; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> 
[[TMP11]], <4 x i32> , <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = and <4 x i1> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP13]], <4 x i32> , <4 x i32> -; CHECK-NEXT: [[PREDPHI3:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP12]], <4 x i32> [[PREDPHI]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP9:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD2]], +; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP10]], <4 x i32> +; CHECK-NEXT: [[PREDPHI3:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> [[PREDPHI]], <4 x i32> ; CHECK-NEXT: store <4 x i32> [[PREDPHI3]], ptr [[TMP5]], align 4, !alias.scope [[META0]], !noalias [[META3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -54,16 +51,16 @@ define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[IF_END14:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] ; CHECK-NEXT: br i1 [[CMP3]], label [[IF_THEN:%.*]], label [[IF_END14]] ; CHECK: if.then: -; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP15]], 19 +; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], 19 ; CHECK-NEXT: br i1 [[CMP6]], label [[IF_END14]], label [[IF_ELSE:%.*]] ; CHECK: if.else: -; CHECK-NEXT: [[CMP10:%.*]] = icmp slt i32 [[TMP16]], 4 +; CHECK-NEXT: [[CMP10:%.*]] = icmp slt i32 [[TMP13]], 4 ; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP10]], i32 4, i32 5 ; CHECK-NEXT: br label [[IF_END14]] ; CHECK: if.end14: @@ -112,3 +109,122 @@ for.end: ret i32 undef } +; As above but with multiple variables set per block. 
+define i32 @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) { +; CHECK-LABEL: @multi_variable_if_nest( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP26]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP1:%.*]] = add nsw i32 [[N]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483644 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META12:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META12]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = and <4 x i1> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD2]], +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = and <4 x i1> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[PREDPHI3:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP12]], <4 x i32> [[PREDPHI]] +; CHECK-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP13]], <4 x i32> [[PREDPHI4]] +; CHECK-NEXT: store <4 x i32> [[PREDPHI3]], ptr [[TMP5]], align 4, !alias.scope [[META9]], !noalias [[META12]] +; CHECK-NEXT: store <4 x i32> [[PREDPHI5]], ptr [[TMP6]], align 4, !alias.scope [[META12]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, 
[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[IF_END14:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP16]], [[TMP17]] +; CHECK-NEXT: br i1 [[CMP3]], label [[IF_THEN:%.*]], label [[IF_END14]] +; CHECK: if.then: +; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP16]], 19 +; CHECK-NEXT: br i1 [[CMP6]], label [[IF_END14]], label [[IF_ELSE:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[CMP10:%.*]] = icmp slt i32 [[TMP17]], 4 +; CHECK-NEXT: [[X_ELSE:%.*]] = select i1 [[CMP10]], i32 4, i32 5 +; CHECK-NEXT: [[Y_ELSE:%.*]] = select i1 [[CMP10]], i32 6, i32 11 +; CHECK-NEXT: br label [[IF_END14]] +; CHECK: if.end14: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 9, [[FOR_BODY]] ], [ 3, [[IF_THEN]] ], [ [[X_ELSE]], [[IF_ELSE]] ] +; CHECK-NEXT: [[Y_0:%.*]] = phi i32 [ 18, [[FOR_BODY]] ], [ 7, [[IF_THEN]] ], [ [[Y_ELSE]], [[IF_ELSE]] ] +; CHECK-NEXT: store i32 [[X_0]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: store i32 [[Y_0]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 undef +; +entry: + %cmp26 = icmp sgt i32 %n, 0 + br i1 %cmp26, label %for.body, label %for.end + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %if.end14 ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %cmp3 = icmp sgt i32 %0, %1 + br i1 %cmp3, label %if.then, label %if.end14 + +if.then: + %cmp6 = icmp sgt i32 %0, 19 + br i1 %cmp6, label %if.end14, label %if.else + +if.else: + %cmp10 = icmp slt i32 %1, 4 + %x.else = select i1 %cmp10, i32 4, i32 5 + %y.else = select i1 %cmp10, i32 6, i32 11 + br label %if.end14 + +if.end14: + %x.0 = phi i32 [ 9, %for.body ], [ 3, %if.then ], [ %x.else, %if.else ] ; <------------- A PHI with 3 entries that we can still vectorize. + %y.0 = phi i32 [ 18, %for.body ], [ 7, %if.then ], [ %y.else, %if.else ] ; <------------- A PHI with 3 entries that we can still vectorize. 
+ store i32 %x.0, ptr %arrayidx, align 4 + store i32 %y.0, ptr %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret i32 undef +} diff --git a/llvm/test/Transforms/LoopVectorize/if-reduction.ll b/llvm/test/Transforms/LoopVectorize/if-reduction.ll index e9761a60fd6ebe..0d5871e24c5247 100644 --- a/llvm/test/Transforms/LoopVectorize/if-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/if-reduction.ll @@ -678,9 +678,8 @@ for.end: ; preds = %for.inc, %entry ; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], ; CHECK-DAG: %[[ADD:.*]] = fadd fast <4 x float> -; CHECK-DAG: %[[C12:.*]] = select <4 x i1> %[[C11]], <4 x i1> %[[C2]], <4 x i1> zeroinitializer -; CHECK: %[[C22:.*]] = select <4 x i1> %[[C11]], <4 x i1> %[[C21]], <4 x i1> zeroinitializer -; CHECK: %[[S1:.*]] = select <4 x i1> %[[C12]], <4 x float> %[[SUB]], <4 x float> %[[ADD]] +; CHECK-DAG: %[[C22:.*]] = select <4 x i1> %[[C11]], <4 x i1> %[[C21]], <4 x i1> zeroinitializer +; CHECK: %[[S1:.*]] = select <4 x i1> %[[C1]], <4 x float> %[[ADD]], <4 x float> %[[SUB]] ; CHECK: %[[S2:.*]] = select <4 x i1> %[[C22]], {{.*}} <4 x float> %[[S1]] define float @fcmp_fadd_fsub(ptr nocapture readonly %a, i32 %n) nounwind readonly { entry: diff --git a/llvm/test/Transforms/LoopVectorize/phi-cost.ll b/llvm/test/Transforms/LoopVectorize/phi-cost.ll index e571b624ed1940..8d407c969b5278 100644 --- a/llvm/test/Transforms/LoopVectorize/phi-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/phi-cost.ll @@ -49,8 +49,8 @@ for.end: ; CHECK: define void @phi_three_incoming_values( ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK: [[PREDPHI:%.*]] = select <2 x i1> {{.*}}, <2 x i32> , <2 x i32> -; CHECK: [[PREDPHI7:%.*]] = select <2 x i1> {{.*}}, <2 x i32> {{.*}}, <2 x i32> [[PREDPHI]] +; CHECK: [[PREDPHI:%.*]] = select <2 x i1> {{.*}}, <2 x i32> {{.*}}, <2 x i32> +; CHECK: [[PREDPHI7:%.*]] = select <2 x i1> {{.*}}, <2 x i32> [[PREDPHI]], <2 x i32> ; CHECK: store <2 x i32> [[PREDPHI7]], ptr {{.*}} ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll index c50bcf8ae88f5c..2e111332ef6c48 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll @@ -587,9 +587,7 @@ define i64 @nested_cond_and(ptr noalias nocapture readonly %a, ptr noalias nocap ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] ; CHECK: pred.load.continue14: ; CHECK-NEXT: [[TMP46:%.*]] = phi <4 x i64> [ [[TMP41]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP45]], [[PRED_LOAD_IF13]] ] -; CHECK-NEXT: [[TMP47:%.*]] = xor <4 x i1> [[TMP25]], -; CHECK-NEXT: [[TMP48:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP47]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP48]], <4 x i64> [[TMP24]], <4 x i64> [[TMP46]] +; CHECK-NEXT: [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP26]], <4 x i64> [[TMP46]], <4 x i64> [[TMP24]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[PREDPHI_V]], <4 x i64> ; CHECK-NEXT: [[PREDPHI15]] = and <4 x i64> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll 
b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll index 8ee12cc2241c35..6407583061e601 100644 --- a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll @@ -111,17 +111,16 @@ define void @single_incoming_phi_with_blend_mask(i64 %a, i64 %b) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP5]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP3]], <2 x i1> [[TMP6]], <2 x i1> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP6]], -; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP3]], <2 x i1> [[TMP8]], <2 x i1> zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP9]], <2 x i16> [[WIDE_LOAD]], <2 x i16> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP3]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP8]], <2 x i16> zeroinitializer, <2 x i16> [[WIDE_LOAD]] ; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP7]], <2 x i16> , <2 x i16> [[PREDPHI]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP0]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0 -; CHECK-NEXT: store <2 x i16> [[PREDPHI1]], ptr [[TMP11]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0 +; CHECK-NEXT: store <2 x i16> [[PREDPHI1]], ptr [[TMP10]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -304,17 +303,16 @@ define void @single_incoming_needs_predication(i64 %a, i64 %b) { ; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ] ; CHECK-NEXT: [[TMP15:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP2]], <2 x i1> [[TMP15]], <2 x i1> zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = xor <2 x i1> [[TMP15]], -; CHECK-NEXT: [[TMP18:%.*]] = select <2 x i1> [[TMP2]], <2 x i1> [[TMP17]], <2 x i1> zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP18]], <2 x i16> [[TMP14]], <2 x i16> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = xor <2 x i1> [[TMP2]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP17]], <2 x i16> zeroinitializer, <2 x i16> [[TMP14]] ; CHECK-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP16]], <2 x i16> , <2 x i16> [[PREDPHI]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP0]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[TMP19]], i32 0 -; CHECK-NEXT: store <2 x i16> [[PREDPHI3]], ptr [[TMP20]], align 2 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[TMP18]], i32 0 +; CHECK-NEXT: store <2 x i16> [[PREDPHI3]], ptr [[TMP19]], align 2 
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 -; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: From 64f19951718075fdd2d2b6d072e8e5ca15a1c6c4 Mon Sep 17 00:00:00 2001 From: Danial Klimkin Date: Fri, 30 Aug 2024 14:59:00 +0200 Subject: [PATCH 21/98] Fix stack overflow in allPathsGoThroughCold past 6b11573b8c5e (#106384) Recursion here causes stack overflow on large inputs. Fixing by unrolling via a stack. --- llvm/lib/Transforms/IPO/FunctionAttrs.cpp | 80 +++++++++++------------ 1 file changed, 39 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 603a1565e48c45..79746201133bdd 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -1762,54 +1762,52 @@ static void addNoReturnAttrs(const SCCNodeSet &SCCNodes, } } -static bool -allBBPathsGoThroughCold(BasicBlock *BB, - SmallDenseMap &Visited) { - // If BB contains a cold callsite this path through the CG is cold. - // Ignore whether the instructions actually are guranteed to transfer - // execution. Divergent behavior is considered unlikely. - if (any_of(*BB, [](Instruction &I) { - if (auto *CB = dyn_cast(&I)) - return CB->hasFnAttr(Attribute::Cold); - return false; - })) { - Visited[BB] = true; - return true; - } - - auto Succs = successors(BB); - // We found a path that doesn't go through any cold callsite. - if (Succs.empty()) - return false; +static bool allPathsGoThroughCold(Function &F) { + SmallDenseMap ColdPaths; + ColdPaths[&F.front()] = false; + SmallVector Jobs; + Jobs.push_back(&F.front()); + + while (!Jobs.empty()) { + BasicBlock *BB = Jobs.pop_back_val(); + + // If block contains a cold callsite this path through the CG is cold. + // Ignore whether the instructions actually are guaranteed to transfer + // execution. Divergent behavior is considered unlikely. + if (any_of(*BB, [](Instruction &I) { + if (auto *CB = dyn_cast(&I)) + return CB->hasFnAttr(Attribute::Cold); + return false; + })) { + ColdPaths[BB] = true; + continue; + } - // We didn't find a cold callsite in this BB, so check that all successors - // contain a cold callsite (or that their successors do). - // Potential TODO: We could use static branch hints to assume certain - // successor paths are inherently cold, irrespective of if they contain a cold - // callsite. - for (auto *Succ : Succs) { - // Start with false, this is necessary to ensure we don't turn loops into - // cold. - auto R = Visited.try_emplace(Succ, false); - if (!R.second) { - if (R.first->second) - continue; + auto Succs = successors(BB); + // We found a path that doesn't go through any cold callsite. + if (Succs.empty()) return false; + + // We didn't find a cold callsite in this BB, so check that all successors + // contain a cold callsite (or that their successors do). + // Potential TODO: We could use static branch hints to assume certain + // successor paths are inherently cold, irrespective of if they contain a + // cold callsite. 
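+    // Rather than recursing into each successor, unvisited successors are
+    // pushed onto the explicit worklist; this keeps stack usage bounded
+    // even for very deep CFGs.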
+    for (BasicBlock *Succ : Succs) {
+      // Start with false, this is necessary to ensure we don't turn loops into
+      // cold.
+      auto [Iter, Inserted] = ColdPaths.try_emplace(Succ, false);
+      if (!Inserted) {
+        if (Iter->second)
+          continue;
+        return false;
+      }
+      Jobs.push_back(Succ);
     }
-    if (!allBBPathsGoThroughCold(Succ, Visited))
-      return false;
-    Visited[Succ] = true;
   }
   return true;
 }

-static bool allPathsGoThroughCold(Function &F) {
-  SmallDenseMap Visited;
-  Visited[&F.front()] = false;
-  return allBBPathsGoThroughCold(&F.front(), Visited);
-}
-
 // Set the cold function attribute if possible.
 static void addColdAttrs(const SCCNodeSet &SCCNodes,
                          SmallSet &Changed) {

From ceb613a8bed218e2c98cd4fad3fd2a4a3217bd77 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Fri, 30 Aug 2024 12:28:34 +0100
Subject: [PATCH 22/98] [RISCV] Add full test coverage for acos/asin/atan and
 cosh/sinh/tanh intrinsics to support #106584

---
 .../SLPVectorizer/RISCV/math-function.ll      | 912 ++++++++++++++++++
 1 file changed, 912 insertions(+)

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
index 059e4c38b519bd..6fbd05aaedfe5b 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
@@ -597,6 +597,690 @@ entry:
   ret <4 x float> %vecins.3
 }
+declare float @cosf(float) readonly nounwind willreturn
+
+; We cannot vectorize cos since RISCV has no such instruction.
+define <4 x float> @cos_4x(ptr %a) {
+; CHECK-LABEL: define <4 x float> @cos_4x
+; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
+; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]])
+; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
+; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_2]])
+; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
+; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+;
+; DEFAULT-LABEL: define <4 x float> @cos_4x
+; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
+; DEFAULT-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+; DEFAULT-NEXT:    [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]])
+; DEFAULT-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
+; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
+; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_2]])
+; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
+; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
+;
+entry:
+  %0 = load <4 x float>, ptr %a, align 16
+  %vecext = extractelement <4 x float> %0, i32 0
+  %1 = tail call fast float @cosf(float %vecext)
+  %vecins = insertelement <4 x float> undef, float %1, i32 0
+  %vecext.1 = extractelement <4 x float> %0, i32 1
+  %2 = tail call fast float @cosf(float %vecext.1)
+  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
+  %vecext.2 = extractelement <4 x float> %0, i32 2
+  %3 = tail call fast float @cosf(float %vecext.2)
+  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
+  %vecext.3 = extractelement <4 x float> %0, i32 3
+  %4 = tail call fast float @cosf(float %vecext.3)
+  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
+  ret <4 x float> %vecins.3
+}
+
+declare float @llvm.cos.f32(float)
+
+; We cannot vectorize cos since RISCV has no such instruction.
+define <4 x float> @int_cos_4x(ptr %a) {
+; CHECK-LABEL: define <4 x float> @int_cos_4x
+; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
+; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
+; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
+; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]])
+; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
+; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+;
+; DEFAULT-LABEL: define <4 x float> @int_cos_4x
+; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
+; DEFAULT-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+; DEFAULT-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
+; DEFAULT-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
+; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
+; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]])
+; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
+; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
+;
+entry:
+  %0 = load <4 x float>, ptr %a, align 16
+  %vecext = extractelement <4 x float> %0, i32 0
+  %1 = tail call fast float @llvm.cos.f32(float %vecext)
+  %vecins = insertelement <4 x float> undef, float %1, i32 0
+  %vecext.1 = extractelement <4 x float> %0, i32 1
+  %2 = tail call fast float @llvm.cos.f32(float %vecext.1)
+  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
+  %vecext.2 = extractelement <4 x float> %0, i32 2
+  %3 = tail call fast float @llvm.cos.f32(float %vecext.2)
+  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
+  %vecext.3 = extractelement <4 x float> %0, i32 3
+  %4 = tail call fast float @llvm.cos.f32(float %vecext.3)
+  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
+  ret <4 x float> %vecins.3
+}
+
+declare float @acosf(float) readonly nounwind willreturn
+
+; We cannot vectorize acos since RISCV has no such instruction.
+define <4 x float> @acos_4x(ptr %a) {
+; CHECK-LABEL: define <4 x float> @acos_4x
+; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
+; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @acosf(float [[VECEXT]])
+; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @acosf(float [[VECEXT_1]])
+; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @acosf(float [[VECEXT_2]])
+; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @acosf(float [[VECEXT_3]])
+; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+;
+; DEFAULT-LABEL: define <4 x float> @acos_4x
+; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
+; DEFAULT-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+; DEFAULT-NEXT:    [[TMP1:%.*]] = tail call fast float @acosf(float [[VECEXT]])
+; DEFAULT-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @acosf(float [[VECEXT_1]])
+; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
+; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @acosf(float [[VECEXT_2]])
+; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @acosf(float [[VECEXT_3]])
+; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
+;
+entry:
+  %0 = load <4 x float>, ptr %a, align 16
+  %vecext = extractelement <4 x float> %0, i32 0
+  %1 = tail call fast float @acosf(float %vecext)
+  %vecins = insertelement <4 x float> undef, float %1, i32 0
+  %vecext.1 = extractelement <4 x float> %0, i32 1
+  %2 = tail call fast float @acosf(float %vecext.1)
+  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
+  %vecext.2 = extractelement <4 x float> %0, i32 2
+  %3 = tail call fast float @acosf(float %vecext.2)
+  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
+  %vecext.3 = extractelement <4 x float> %0, i32 3
+  %4 = tail call fast float @acosf(float %vecext.3)
+  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
+  ret <4 x float> %vecins.3
+}
+
+declare float @llvm.acos.f32(float)
+
+; We cannot vectorize acos since RISCV has no such instruction.
+define <4 x float> @int_acos_4x(ptr %a) {
+; CHECK-LABEL: define <4 x float> @int_acos_4x
+; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
+; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT]])
+; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]])
+; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_2]])
+; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_3]])
+; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+;
+; DEFAULT-LABEL: define <4 x float> @int_acos_4x
+; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
+; DEFAULT-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+; DEFAULT-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT]])
+; DEFAULT-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]])
+; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
+; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_2]])
+; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_3]])
+; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
+;
+entry:
+  %0 = load <4 x float>, ptr %a, align 16
+  %vecext = extractelement <4 x float> %0, i32 0
+  %1 = tail call fast float @llvm.acos.f32(float %vecext)
+  %vecins = insertelement <4 x float> undef, float %1, i32 0
+  %vecext.1 = extractelement <4 x float> %0, i32 1
+  %2 = tail call fast float @llvm.acos.f32(float %vecext.1)
+  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
+  %vecext.2 = extractelement <4 x float> %0, i32 2
+  %3 = tail call fast float @llvm.acos.f32(float %vecext.2)
+  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
+  %vecext.3 = extractelement <4 x float> %0, i32 3
+  %4 = tail call fast float @llvm.acos.f32(float %vecext.3)
+  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
+  ret <4 x float> %vecins.3
+}
+
+declare float @tanf(float) readonly nounwind willreturn
+
+; We cannot vectorize tan since RISCV has no such instruction.
+define <4 x float> @tan_4x(ptr %a) {
+; CHECK-LABEL: define <4 x float> @tan_4x
+; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
+; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @tanf(float [[VECEXT]])
+; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @tanf(float [[VECEXT_1]])
+; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @tanf(float [[VECEXT_2]])
+; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @tanf(float [[VECEXT_3]])
+; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+;
+; DEFAULT-LABEL: define <4 x float> @tan_4x
+; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
+; DEFAULT-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+; DEFAULT-NEXT:    [[TMP1:%.*]] = tail call fast float @tanf(float [[VECEXT]])
+; DEFAULT-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @tanf(float [[VECEXT_1]])
+; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
+; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @tanf(float [[VECEXT_2]])
+; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @tanf(float [[VECEXT_3]])
+; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
+;
+entry:
+  %0 = load <4 x float>, ptr %a, align 16
+  %vecext = extractelement <4 x float> %0, i32 0
+  %1 = tail call fast float @tanf(float %vecext)
+  %vecins = insertelement <4 x float> undef, float %1, i32 0
+  %vecext.1 = extractelement <4 x float> %0, i32 1
+  %2 = tail call fast float @tanf(float %vecext.1)
+  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
+  %vecext.2 = extractelement <4 x float> %0, i32 2
+  %3 = tail call fast float @tanf(float %vecext.2)
+  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
+  %vecext.3 = extractelement <4 x float> %0, i32 3
+  %4 = tail call fast float @tanf(float %vecext.3)
+  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
+  ret <4 x float> %vecins.3
+}
+
+declare float @llvm.tan.f32(float)
+
+; We cannot vectorize tan since RISCV has no such instruction.
+define <4 x float> @int_tan_4x(ptr %a) {
+; CHECK-LABEL: define <4 x float> @int_tan_4x
+; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
+; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT]])
+; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT_1]])
+; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT_2]])
+; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT_3]])
+; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+;
+; DEFAULT-LABEL: define <4 x float> @int_tan_4x
+; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
+; DEFAULT-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+; DEFAULT-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT]])
+; DEFAULT-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT_1]])
+; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
+; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT_2]])
+; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT_3]])
+; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
+;
+entry:
+  %0 = load <4 x float>, ptr %a, align 16
+  %vecext = extractelement <4 x float> %0, i32 0
+  %1 = tail call fast float @llvm.tan.f32(float %vecext)
+  %vecins = insertelement <4 x float> undef, float %1, i32 0
+  %vecext.1 = extractelement <4 x float> %0, i32 1
+  %2 = tail call fast float @llvm.tan.f32(float %vecext.1)
+  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
+  %vecext.2 = extractelement <4 x float> %0, i32 2
+  %3 = tail call fast float @llvm.tan.f32(float %vecext.2)
+  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
+  %vecext.3 = extractelement <4 x float> %0, i32 3
+  %4 = tail call fast float @llvm.tan.f32(float %vecext.3)
+  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
+  ret <4 x float> %vecins.3
+}
+
+declare float @atanf(float) readonly nounwind willreturn
+
+; We cannot vectorize atan since RISCV has no such instruction.
+define <4 x float> @atan_4x(ptr %a) {
+; CHECK-LABEL: define <4 x float> @atan_4x
+; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
+; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @atanf(float [[VECEXT]])
+; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @atanf(float [[VECEXT_1]])
+; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @atanf(float [[VECEXT_2]])
+; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @atanf(float [[VECEXT_3]])
+; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+;
+; DEFAULT-LABEL: define <4 x float> @atan_4x
+; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
+; DEFAULT-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+; DEFAULT-NEXT:    [[TMP1:%.*]] = tail call fast float @atanf(float [[VECEXT]])
+; DEFAULT-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @atanf(float [[VECEXT_1]])
+; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
+; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @atanf(float [[VECEXT_2]])
+; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @atanf(float [[VECEXT_3]])
+; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
+;
+entry:
+  %0 = load <4 x float>, ptr %a, align 16
+  %vecext = extractelement <4 x float> %0, i32 0
+  %1 = tail call fast float @atanf(float %vecext)
+  %vecins = insertelement <4 x float> undef, float %1, i32 0
+  %vecext.1 = extractelement <4 x float> %0, i32 1
+  %2 = tail call fast float @atanf(float %vecext.1)
+  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
+ %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @atanf(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @atanf(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @llvm.atan.f32(float) + +; We can not vectorized atan tance RISCV has no such instruction. +define <4 x float> @int_atan_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @int_atan_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @int_atan_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.atan.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.atan.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.atan.f32(float 
%vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.atan.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @sinhf(float) readonly nounwind willreturn + +; We can not vectorized sinh since RISCV has no such instruction. +define <4 x float> @sinh_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @sinh_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @sinhf(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @sinhf(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @sinhf(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @sinhf(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @sinh_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @sinhf(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @sinhf(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @sinhf(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @sinhf(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @sinhf(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @sinhf(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @sinhf(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float 
@sinhf(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @llvm.sinh.f32(float) + +; We can not vectorized sinh since RISCV has no such instruction. +define <4 x float> @int_sinh_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @int_sinh_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @int_sinh_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.sinh.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.sinh.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.sinh.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.sinh.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 
3 + ret <4 x float> %vecins.3 +} + +declare float @asinhf(float) readonly nounwind willreturn + +; We can not vectorized asinh since RISCV has no such instruction. +define <4 x float> @asinh_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @asinh_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @asinhf(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @asinhf(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @asinhf(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @asinhf(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @asinh_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @asinhf(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @asinhf(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @asinhf(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @asinhf(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @asinhf(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @asinhf(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @asinhf(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @asinhf(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @llvm.asinh.f32(float) + +; We can not vectorized asinh since RISCV has no such instruction. 
+define <4 x float> @int_asinh_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @int_asinh_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.asinh.f32(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.asinh.f32(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.asinh.f32(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.asinh.f32(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @int_asinh_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.asinh.f32(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.asinh.f32(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.asinh.f32(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.asinh.f32(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.asinh.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.asinh.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.asinh.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.asinh.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + declare float @coshf(float) readonly nounwind willreturn ; We can not vectorized cosh since RISCV has no such instruction. 
@@ -711,6 +1395,234 @@ entry: ret <4 x float> %vecins.3 } +declare float @acoshf(float) readonly nounwind willreturn + +; We can not vectorized acosh since RISCV has no such instruction. +define <4 x float> @acosh_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @acosh_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @acoshf(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @acoshf(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @acoshf(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @acoshf(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @acosh_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @acoshf(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @acoshf(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @acoshf(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @acoshf(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @acoshf(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @acoshf(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @acoshf(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @acoshf(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @llvm.acosh.f32(float) + +; We can not vectorized acosh since RISCV has no such 
instruction. +define <4 x float> @int_acosh_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @int_acosh_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.acosh.f32(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acosh.f32(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.acosh.f32(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.acosh.f32(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @int_acosh_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.acosh.f32(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acosh.f32(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.acosh.f32(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.acosh.f32(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.acosh.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.acosh.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.acosh.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.acosh.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @tanhf(float) readonly nounwind willreturn + +; We can not vectorized tanh since RISCV has no such instruction. 
+define <4 x float> @tanh_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @tanh_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @tanhf(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @tanhf(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @tanhf(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @tanhf(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @tanh_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @tanhf(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @tanhf(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @tanhf(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @tanhf(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @tanhf(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @tanhf(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @tanhf(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @tanhf(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @llvm.tanh.f32(float) + +; We can not vectorized tanh since RISCV has no such instruction. 
+define <4 x float> @int_tanh_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @int_tanh_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @int_tanh_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.tanh.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.tanh.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.tanh.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.tanh.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + declare float @atanhf(float) readonly nounwind willreturn ; We can not vectorized atanh since RISCV has no such instruction. 
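For contrast, when a target does advertise a vector lowering for one of these calls, SLP collapses each four-call chain above into a single wide call. A minimal hand-written sketch of that shape (not output of this patch; the function name is made up, and it assumes the overloaded vector form @llvm.acos.v4f32 of the intrinsic exercised above):

define <4 x float> @acos_4x_vectorized(ptr %a) {
entry:
  ; One wide load feeding a single vector intrinsic call replaces the
  ; four extractelement/scalar-call/insertelement chains checked above.
  %v = load <4 x float>, ptr %a, align 16
  %r = call fast <4 x float> @llvm.acos.v4f32(<4 x float> %v)
  ret <4 x float> %r
}

declare <4 x float> @llvm.acos.v4f32(<4 x float>)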
From 8586d0330e36b22496f9ba5ed116bc1aac5a1f28 Mon Sep 17 00:00:00 2001
From: vdonaldson <37090318+vdonaldson@users.noreply.github.com>
Date: Fri, 30 Aug 2024 09:07:30 -0400
Subject: [PATCH 23/98] [flang] Don't generate empty else blocks (#106618)

Code lowering always generates fir.if else blocks for source level if
statements, whether needed or not. Change this to only generate else
blocks that are needed.
---
 flang/lib/Lower/Bridge.cpp | 7 +++++--
 flang/test/HLFIR/assumed_shape_with_value_keyword.f90 | 2 --
 flang/test/Lower/HLFIR/select-rank.f90 | 1 -
 flang/test/Lower/Intrinsics/system_clock.f90 | 2 --
 flang/test/Lower/OpenMP/master.f90 | 2 +-
 flang/test/Lower/OpenMP/unstructured.f90 | 1 -
 flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 | 1 -
 flang/test/Lower/OpenMP/wsloop-reduction-max.f90 | 1 -
 flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 | 1 -
 flang/test/Lower/OpenMP/wsloop-reduction-min.f90 | 1 -
 flang/test/Lower/OpenMP/wsloop-variable.f90 | 1 -
 11 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 90943fa92493ce..e5ccf659c3f8ed 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2349,8 +2349,11 @@ class FirConverter : public Fortran::lower::AbstractConverter {
 fir::IfOp topIfOp, currentIfOp;
 for (Fortran::lower::pft::Evaluation &e : eval.getNestedEvaluations()) {
 auto genIfOp = [&](mlir::Value cond) {
- auto ifOp =
- builder->create<fir::IfOp>(toLocation(), cond, /*withElse=*/true);
+ Fortran::lower::pft::Evaluation &succ = *e.controlSuccessor;
+ bool hasElse = succ.isA<Fortran::parser::ElseIfStmt>() ||
+ succ.isA<Fortran::parser::ElseStmt>();
+ auto ifOp = builder->create<fir::IfOp>(toLocation(), cond,
+ /*withElseRegion=*/hasElse);
 builder->setInsertionPointToStart(&ifOp.getThenRegion().front());
 return ifOp;
 };
diff --git a/flang/test/HLFIR/assumed_shape_with_value_keyword.f90 b/flang/test/HLFIR/assumed_shape_with_value_keyword.f90
index 197efc08422c6e..208f22badda28d 100644
--- a/flang/test/HLFIR/assumed_shape_with_value_keyword.f90
+++ b/flang/test/HLFIR/assumed_shape_with_value_keyword.f90
@@ -102,7 +102,6 @@ subroutine test_optional1(x)
 ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]]#0 : (!fir.box>) -> !fir.ref>
 ! CHECK: fir.call @_QPinternal_call7(%[[VAL_3]]) fastmath : (!fir.ref>) -> ()
 ! CHECK: hlfir.copy_out %[[TMP_BOX]], %[[VAL_2]]#1 to %[[VAL_0]]#0 : (!fir.ref>>>, i1, !fir.box>) -> ()
-! CHECK: } else {
 ! CHECK: }
 ! CHECK: return
 ! CHECK: }
@@ -122,7 +121,6 @@ subroutine test_optional2(x)
 ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]]#0 : (!fir.box>) -> !fir.ref>
 ! CHECK: fir.call @_QPinternal_call8(%[[VAL_3]]) fastmath : (!fir.ref>) -> ()
 ! CHECK: hlfir.copy_out %[[TMP_BOX]], %[[VAL_2]]#1 to %[[VAL_0]]#0 : (!fir.ref>>>, i1, !fir.box>) -> ()
-! CHECK: } else {
 ! CHECK: }
 ! CHECK: return
 ! CHECK: }
diff --git a/flang/test/Lower/HLFIR/select-rank.f90 b/flang/test/Lower/HLFIR/select-rank.f90
index 211b7565bab8a3..d27a6d732ffc71 100644
--- a/flang/test/Lower/HLFIR/select-rank.f90
+++ b/flang/test/Lower/HLFIR/select-rank.f90
@@ -796,7 +796,6 @@ subroutine test_branching(x)
 ! CHECK: %[[VAL_10:.*]] = arith.xori %[[VAL_8]], %[[VAL_9]] : i1
 ! CHECK: fir.if %[[VAL_10]] {
 ! CHECK: fir.call @_QPone() fastmath : () -> ()
-! CHECK: } else {
 ! CHECK: }
 ! CHECK: fir.call @_QPrdefault(%[[VAL_6]]#0) fastmath : (!fir.box>) -> ()
 ! 
CHECK: cf.br ^bb7 diff --git a/flang/test/Lower/Intrinsics/system_clock.f90 b/flang/test/Lower/Intrinsics/system_clock.f90 index ca36920c04eb3b..9eae3a58884faf 100644 --- a/flang/test/Lower/Intrinsics/system_clock.f90 +++ b/flang/test/Lower/Intrinsics/system_clock.f90 @@ -104,7 +104,6 @@ subroutine ss(count) ! CHECK: fir.if %[[V_17]] { ! CHECK: %[[C_0:c[0-9a-z_]+]] = arith.constant 0 : i64 ! CHECK: fir.store %[[C_0]] to %arg0 : !fir.ref - ! CHECK: } else { ! CHECK: } ! CHECK: %[[V_18:[0-9]+]] = fir.zero_bits !fir.ptr ! CHECK: fir.store %[[V_18]] to %[[V_4]] : !fir.ref> @@ -137,7 +136,6 @@ subroutine ss(count) ! CHECK: %[[V_32]] = fir.load %arg0 : !fir.ref ! CHECK: %[[V_33]] = fir.call @_FortranAioOutputInteger64(%[[V_31]], %[[V_32]]) {{.*}}: (!fir.ref, i64) -> i1 ! CHECK: %[[V_34]] = fir.call @_FortranAioEndIoStatement(%[[V_31]]) {{.*}}: (!fir.ref) -> i32 - ! CHECK: } else { ! CHECK: } ! CHECK: return ! CHECK: } diff --git a/flang/test/Lower/OpenMP/master.f90 b/flang/test/Lower/OpenMP/master.f90 index 7db1be4f005b57..9f98ac89fb1fd9 100644 --- a/flang/test/Lower/OpenMP/master.f90 +++ b/flang/test/Lower/OpenMP/master.f90 @@ -91,7 +91,7 @@ subroutine omp_master_parallel() !CHECK: hlfir.assign %{{.*}} to %{{.*}}#0 : i32, !fir.ref beta = alpha + gama end if - !CHECK: else + !CHECK: } !CHECK: omp.terminator !$omp end master diff --git a/flang/test/Lower/OpenMP/unstructured.f90 b/flang/test/Lower/OpenMP/unstructured.f90 index 9c3527eda5bb43..bd030b918033e6 100644 --- a/flang/test/Lower/OpenMP/unstructured.f90 +++ b/flang/test/Lower/OpenMP/unstructured.f90 @@ -141,7 +141,6 @@ subroutine ss3(n) ! nested unstructured OpenMP constructs ! CHECK: @_FortranAioBeginExternalListOutput ! CHECK: %[[LOAD:.*]] = fir.load %[[OMP_LOOP_J_DECL]]#0 : !fir.ref ! CHECK: @_FortranAioOutputInteger32(%{{.*}}, %[[LOAD]]) -! CHECK: } else { ! CHECK: } ! CHECK-NEXT: omp.yield ! CHECK-NEXT: } diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 index 7e4890dd00fea3..56a43abca42a76 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 @@ -118,7 +118,6 @@ ! CHECK: %[[VAL_46:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_45]]) : (!fir.box>, i64) -> !fir.ref ! CHECK: %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref ! CHECK: hlfir.assign %[[VAL_47]] to %[[VAL_37]]#0 : f32, !fir.ref -! CHECK: } else { ! CHECK: } ! CHECK: omp.yield ! CHECK: omp.terminator diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 index 9a93c75f5bd1a8..775554fd3dcca1 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 @@ -108,7 +108,6 @@ ! CHECK: %[[VAL_46:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_45]]) : (!fir.box>, i64) -> !fir.ref ! CHECK: %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref ! CHECK: hlfir.assign %[[VAL_47]] to %[[VAL_37]]#0 : f32, !fir.ref -! CHECK: } else { ! CHECK: } ! CHECK: omp.yield ! CHECK: omp.terminator diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 index 41fcc979cdc9d9..d16de4a867a24c 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 @@ -120,7 +120,6 @@ ! CHECK: %[[VAL_46:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_45]]) : (!fir.box>, i64) -> !fir.ref ! 
CHECK: %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref ! CHECK: hlfir.assign %[[VAL_47]] to %[[VAL_37]]#0 : f32, !fir.ref -! CHECK: } else { ! CHECK: } ! CHECK: omp.yield ! CHECK: omp.terminator diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 index 50b2db9463d23d..04957c7287eae4 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 @@ -110,7 +110,6 @@ ! CHECK: %[[VAL_46:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_45]]) : (!fir.box>, i64) -> !fir.ref ! CHECK: %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref ! CHECK: hlfir.assign %[[VAL_47]] to %[[VAL_37]]#0 : f32, !fir.ref -! CHECK: } else { ! CHECK: } ! CHECK: omp.yield ! CHECK: omp.terminator diff --git a/flang/test/Lower/OpenMP/wsloop-variable.f90 b/flang/test/Lower/OpenMP/wsloop-variable.f90 index dc2acf881f482a..7bfb9274f389a3 100644 --- a/flang/test/Lower/OpenMP/wsloop-variable.f90 +++ b/flang/test/Lower/OpenMP/wsloop-variable.f90 @@ -190,7 +190,6 @@ subroutine wsloop_variable_sub !CHECK: %[[VAL_56:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref !CHECK: %[[VAL_57:.*]] = arith.cmpi eq, %[[VAL_55]], %[[VAL_56]] : i8 !CHECK: fir.if %[[VAL_57]] { -!CHECK: } else { !CHECK: } !CHECK: omp.yield !CHECK: } From 2a8fda443e71707e73607feda2af0dbc871c972f Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Fri, 30 Aug 2024 14:12:52 +0100 Subject: [PATCH 24/98] LICM: extend hoistAddSub to unsigned case (#106373) Trivially extend dd0cf23 ([LICM] Reassociate & hoist sub expressions) to handle unsigned predicates as well. Alive2 proofs: https://alive2.llvm.org/ce/z/GdDBtT. --- llvm/lib/Transforms/Scalar/LICM.cpp | 57 ++- llvm/test/Transforms/LICM/hoist-add-sub.ll | 496 ++++++++++++++++++++- 2 files changed, 533 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 526ae4e8834396..86c7dceffc5245 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -2537,14 +2537,19 @@ static bool hoistAdd(ICmpInst::Predicate Pred, Value *VariantLHS, Value *InvariantRHS, ICmpInst &ICmp, Loop &L, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU, AssumptionCache *AC, DominatorTree *DT) { - assert(ICmpInst::isSigned(Pred) && "Not supported yet!"); assert(!L.isLoopInvariant(VariantLHS) && "Precondition."); assert(L.isLoopInvariant(InvariantRHS) && "Precondition."); + bool IsSigned = ICmpInst::isSigned(Pred); + // Try to represent VariantLHS as sum of invariant and variant operands. using namespace PatternMatch; Value *VariantOp, *InvariantOp; - if (!match(VariantLHS, m_NSWAdd(m_Value(VariantOp), m_Value(InvariantOp)))) + if (IsSigned && + !match(VariantLHS, m_NSWAdd(m_Value(VariantOp), m_Value(InvariantOp)))) + return false; + if (!IsSigned && + !match(VariantLHS, m_NUWAdd(m_Value(VariantOp), m_Value(InvariantOp)))) return false; // LHS itself is a loop-variant, try to represent it in the form: @@ -2559,17 +2564,20 @@ static bool hoistAdd(ICmpInst::Predicate Pred, Value *VariantLHS, // normal linear arithmetics). Overflows make things much more complicated, so // we want to avoid this. 
auto &DL = L.getHeader()->getDataLayout(); - bool ProvedNoOverflowAfterReassociate = - computeOverflowForSignedSub(InvariantRHS, InvariantOp, - SimplifyQuery(DL, DT, AC, &ICmp)) == - llvm::OverflowResult::NeverOverflows; - if (!ProvedNoOverflowAfterReassociate) + SimplifyQuery SQ(DL, DT, AC, &ICmp); + if (IsSigned && computeOverflowForSignedSub(InvariantRHS, InvariantOp, SQ) != + llvm::OverflowResult::NeverOverflows) + return false; + if (!IsSigned && + computeOverflowForUnsignedSub(InvariantRHS, InvariantOp, SQ) != + llvm::OverflowResult::NeverOverflows) return false; auto *Preheader = L.getLoopPreheader(); assert(Preheader && "Loop is not in simplify form?"); IRBuilder<> Builder(Preheader->getTerminator()); - Value *NewCmpOp = Builder.CreateSub(InvariantRHS, InvariantOp, "invariant.op", - /*HasNUW*/ false, /*HasNSW*/ true); + Value *NewCmpOp = + Builder.CreateSub(InvariantRHS, InvariantOp, "invariant.op", + /*HasNUW*/ !IsSigned, /*HasNSW*/ IsSigned); ICmp.setPredicate(Pred); ICmp.setOperand(0, VariantOp); ICmp.setOperand(1, NewCmpOp); @@ -2584,14 +2592,19 @@ static bool hoistSub(ICmpInst::Predicate Pred, Value *VariantLHS, Value *InvariantRHS, ICmpInst &ICmp, Loop &L, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU, AssumptionCache *AC, DominatorTree *DT) { - assert(ICmpInst::isSigned(Pred) && "Not supported yet!"); assert(!L.isLoopInvariant(VariantLHS) && "Precondition."); assert(L.isLoopInvariant(InvariantRHS) && "Precondition."); + bool IsSigned = ICmpInst::isSigned(Pred); + // Try to represent VariantLHS as sum of invariant and variant operands. using namespace PatternMatch; Value *VariantOp, *InvariantOp; - if (!match(VariantLHS, m_NSWSub(m_Value(VariantOp), m_Value(InvariantOp)))) + if (IsSigned && + !match(VariantLHS, m_NSWSub(m_Value(VariantOp), m_Value(InvariantOp)))) + return false; + if (!IsSigned && + !match(VariantLHS, m_NUWSub(m_Value(VariantOp), m_Value(InvariantOp)))) return false; bool VariantSubtracted = false; @@ -2613,16 +2626,26 @@ static bool hoistSub(ICmpInst::Predicate Pred, Value *VariantLHS, // "C1 - C2" does not overflow. auto &DL = L.getHeader()->getDataLayout(); SimplifyQuery SQ(DL, DT, AC, &ICmp); - if (VariantSubtracted) { + if (VariantSubtracted && IsSigned) { // C1 - LV < C2 --> LV > C1 - C2 if (computeOverflowForSignedSub(InvariantOp, InvariantRHS, SQ) != llvm::OverflowResult::NeverOverflows) return false; - } else { + } else if (VariantSubtracted && !IsSigned) { + // C1 - LV < C2 --> LV > C1 - C2 + if (computeOverflowForUnsignedSub(InvariantOp, InvariantRHS, SQ) != + llvm::OverflowResult::NeverOverflows) + return false; + } else if (!VariantSubtracted && IsSigned) { // LV - C1 < C2 --> LV < C1 + C2 if (computeOverflowForSignedAdd(InvariantOp, InvariantRHS, SQ) != llvm::OverflowResult::NeverOverflows) return false; + } else { // !VariantSubtracted && !IsSigned + // LV - C1 < C2 --> LV < C1 + C2 + if (computeOverflowForUnsignedAdd(InvariantOp, InvariantRHS, SQ) != + llvm::OverflowResult::NeverOverflows) + return false; } auto *Preheader = L.getLoopPreheader(); assert(Preheader && "Loop is not in simplify form?"); @@ -2630,9 +2653,9 @@ static bool hoistSub(ICmpInst::Predicate Pred, Value *VariantLHS, Value *NewCmpOp = VariantSubtracted ? 
Builder.CreateSub(InvariantOp, InvariantRHS, "invariant.op", - /*HasNUW*/ false, /*HasNSW*/ true) + /*HasNUW*/ !IsSigned, /*HasNSW*/ IsSigned) : Builder.CreateAdd(InvariantOp, InvariantRHS, "invariant.op", - /*HasNUW*/ false, /*HasNSW*/ true); + /*HasNUW*/ !IsSigned, /*HasNSW*/ IsSigned); ICmp.setPredicate(Pred); ICmp.setOperand(0, VariantOp); ICmp.setOperand(1, NewCmpOp); @@ -2650,10 +2673,6 @@ static bool hoistAddSub(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, if (!match(&I, m_ICmp(Pred, m_Value(LHS), m_Value(RHS)))) return false; - // TODO: Support unsigned predicates? - if (!ICmpInst::isSigned(Pred)) - return false; - // Put variant operand to LHS position. if (L.isLoopInvariant(LHS)) { std::swap(LHS, RHS); diff --git a/llvm/test/Transforms/LICM/hoist-add-sub.ll b/llvm/test/Transforms/LICM/hoist-add-sub.ll index 5393cdb1d29c43..d9b868eda579f9 100644 --- a/llvm/test/Transforms/LICM/hoist-add-sub.ll +++ b/llvm/test/Transforms/LICM/hoist-add-sub.ll @@ -51,6 +51,55 @@ out_of_bounds: ret i32 -1 } +define i32 @test_01_unsigned(ptr %p, ptr %x_p, ptr %length_p) { +; CHECK-LABEL: define i32 @test_01_unsigned +; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4, !range [[RNG1:![0-9]+]] +; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4, !range [[RNG2:![0-9]+]] +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = sub nuw i32 [[X]], 4 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp ugt i32 [[IV]], [[INVARIANT_OP]] +; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], [[LENGTH]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[IV_NEXT_LCSSA]] +; CHECK: out_of_bounds: +; CHECK-NEXT: ret i32 -1 +; +entry: + %x = load i32, ptr %x_p, !range !2 + %length = load i32, ptr %length_p, !range !1 + br label %loop + +loop: + %iv = phi i32 [0, %entry], [%iv.next, %backedge] + %arith = sub nuw i32 %x, %iv + %x_check = icmp ult i32 %arith, 4 + br i1 %x_check, label %out_of_bounds, label %backedge + +backedge: + %el.ptr = getelementptr i32, ptr %p, i32 %iv + store i32 1, ptr %el.ptr + %iv.next = add nuw nsw i32 %iv, 4 + %loop_cond = icmp ult i32 %iv.next, %length + br i1 %loop_cond, label %loop, label %exit + +exit: + ret i32 %iv.next + +out_of_bounds: + ret i32 -1 +} + ; TODO: x - iv < 4 ==> iv > x - 4 define i32 @test_01a(ptr %p, ptr %x_p, ptr %length_p) { ; CHECK-LABEL: define i32 @test_01a @@ -114,6 +163,68 @@ failed: ret i32 -2 } +define i32 @test_01a_unsigned(ptr %p, ptr %x_p, ptr %length_p) { +; CHECK-LABEL: define i32 @test_01a_unsigned +; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4 +; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4 +; CHECK-NEXT: [[PRECOND_1:%.*]] = icmp uge i32 [[X]], 0 +; CHECK-NEXT: [[PRECOND_2:%.*]] = icmp uge i32 [[LENGTH]], 0 +; CHECK-NEXT: [[PRECOND:%.*]] = and i1 [[PRECOND_1]], [[PRECOND_2]] 
+; CHECK-NEXT: br i1 [[PRECOND]], label [[LOOP_PREHEADER:%.*]], label [[FAILED:%.*]] +; CHECK: loop.preheader: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[ARITH:%.*]] = sub nuw i32 [[X]], [[IV]] +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp ult i32 [[ARITH]], 4 +; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], [[LENGTH]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[IV_NEXT_LCSSA]] +; CHECK: out_of_bounds: +; CHECK-NEXT: ret i32 -1 +; CHECK: failed: +; CHECK-NEXT: ret i32 -2 +; +entry: + %x = load i32, ptr %x_p + %length = load i32, ptr %length_p + %precond_1 = icmp uge i32 %x, 0 + %precond_2 = icmp uge i32 %length, 0 + %precond = and i1 %precond_1, %precond_2 + br i1 %precond, label %loop, label %failed + +loop: + %iv = phi i32 [0, %entry], [%iv.next, %backedge] + %arith = sub nuw i32 %x, %iv + %x_check = icmp ult i32 %arith, 4 + br i1 %x_check, label %out_of_bounds, label %backedge + +backedge: + %el.ptr = getelementptr i32, ptr %p, i32 %iv + store i32 1, ptr %el.ptr + %iv.next = add nuw nsw i32 %iv, 4 + %loop_cond = icmp ult i32 %iv.next, %length + br i1 %loop_cond, label %loop, label %exit + +exit: + ret i32 %iv.next + +out_of_bounds: + ret i32 -1 + +failed: + ret i32 -2 +} + ; Range info is missing for x, cannot prove no-overflow. Should not hoist. 
define i32 @test_01_neg(ptr %p, ptr %x_p, ptr %length_p) { ; CHECK-LABEL: define i32 @test_01_neg @@ -164,6 +275,54 @@ out_of_bounds: ret i32 -1 } +define i32 @test_01_neg_unsigned(ptr %p, ptr %x_p, ptr %length_p) { +; CHECK-LABEL: define i32 @test_01_neg_unsigned +; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4 +; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4, !range [[RNG0]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[ARITH:%.*]] = sub nuw i32 [[X]], [[IV]] +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp ult i32 [[ARITH]], 4 +; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], [[LENGTH]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[IV_NEXT_LCSSA]] +; CHECK: out_of_bounds: +; CHECK-NEXT: ret i32 -1 +; +entry: + %x = load i32, ptr %x_p + %length = load i32, ptr %length_p, !range !0 + br label %loop + +loop: + %iv = phi i32 [0, %entry], [%iv.next, %backedge] + %arith = sub nuw i32 %x, %iv + %x_check = icmp ult i32 %arith, 4 + br i1 %x_check, label %out_of_bounds, label %backedge + +backedge: + %el.ptr = getelementptr i32, ptr %p, i32 %iv + store i32 1, ptr %el.ptr + %iv.next = add nuw nsw i32 %iv, 4 + %loop_cond = icmp ult i32 %iv.next, %length + br i1 %loop_cond, label %loop, label %exit + +exit: + ret i32 %iv.next + +out_of_bounds: + ret i32 -1 +} ; x + iv < 4 ==> iv < 4 - x define i32 @test_02(ptr %p, ptr %x_p, ptr %length_p) { @@ -215,6 +374,55 @@ out_of_bounds: ret i32 -1 } +define i32 @test_02_unsigned(ptr %p, ptr %x_p, ptr %length_p) { +; CHECK-LABEL: define i32 @test_02_unsigned +; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4, !range [[RNG3:![0-9]+]] +; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4, !range [[RNG2]] +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = sub nuw i32 4, [[X]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp ult i32 [[IV]], [[INVARIANT_OP]] +; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], [[LENGTH]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[IV_NEXT_LCSSA]] +; CHECK: out_of_bounds: +; CHECK-NEXT: ret i32 -1 +; +entry: + %x = load i32, ptr %x_p, !range !3 + %length = load i32, ptr %length_p, !range !1 + br label %loop + +loop: + %iv = phi i32 [0, %entry], [%iv.next, %backedge] + %arith = add nuw i32 %x, %iv + %x_check = icmp ult 
i32 %arith, 4 + br i1 %x_check, label %out_of_bounds, label %backedge + +backedge: + %el.ptr = getelementptr i32, ptr %p, i32 %iv + store i32 1, ptr %el.ptr + %iv.next = add nuw nsw i32 %iv, 4 + %loop_cond = icmp ult i32 %iv.next, %length + br i1 %loop_cond, label %loop, label %exit + +exit: + ret i32 %iv.next + +out_of_bounds: + ret i32 -1 +} + ; TODO: x + iv < 4 ==> iv < 4 - x define i32 @test_02a(ptr %p, ptr %x_p, ptr %length_p) { ; CHECK-LABEL: define i32 @test_02a @@ -278,12 +486,74 @@ failed: ret i32 -2 } +define i32 @test_02a_unsigned(ptr %p, ptr %x_p, ptr %length_p) { +; CHECK-LABEL: define i32 @test_02a_unsigned +; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4 +; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4 +; CHECK-NEXT: [[PRECOND_1:%.*]] = icmp uge i32 [[X]], 0 +; CHECK-NEXT: [[PRECOND_2:%.*]] = icmp uge i32 [[LENGTH]], 0 +; CHECK-NEXT: [[PRECOND:%.*]] = and i1 [[PRECOND_1]], [[PRECOND_2]] +; CHECK-NEXT: br i1 [[PRECOND]], label [[LOOP_PREHEADER:%.*]], label [[FAILED:%.*]] +; CHECK: loop.preheader: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[ARITH:%.*]] = add nuw i32 [[X]], [[IV]] +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp ult i32 [[ARITH]], 4 +; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], [[LENGTH]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[IV_NEXT_LCSSA]] +; CHECK: out_of_bounds: +; CHECK-NEXT: ret i32 -1 +; CHECK: failed: +; CHECK-NEXT: ret i32 -2 +; +entry: + %x = load i32, ptr %x_p + %length = load i32, ptr %length_p + %precond_1 = icmp uge i32 %x, 0 + %precond_2 = icmp uge i32 %length, 0 + %precond = and i1 %precond_1, %precond_2 + br i1 %precond, label %loop, label %failed + +loop: + %iv = phi i32 [0, %entry], [%iv.next, %backedge] + %arith = add nuw i32 %x, %iv + %x_check = icmp ult i32 %arith, 4 + br i1 %x_check, label %out_of_bounds, label %backedge + +backedge: + %el.ptr = getelementptr i32, ptr %p, i32 %iv + store i32 1, ptr %el.ptr + %iv.next = add nuw nsw i32 %iv, 4 + %loop_cond = icmp ult i32 %iv.next, %length + br i1 %loop_cond, label %loop, label %exit + +exit: + ret i32 %iv.next + +out_of_bounds: + ret i32 -1 + +failed: + ret i32 -2 +} + ; iv - x < 4 ==> iv < 4 + x define i32 @test_03(ptr %p, ptr %x_p, ptr %length_p) { ; CHECK-LABEL: define i32 @test_03 ; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4, !range [[RNG1:![0-9]+]] +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4, !range [[RNG2]] ; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4, !range [[RNG0]] ; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add nsw i32 [[X]], 4 ; CHECK-NEXT: br label [[LOOP:%.*]] @@ -328,6 +598,55 @@ out_of_bounds: ret i32 -1 } +define i32 @test_03_unsigned(ptr %p, ptr %x_p, ptr %length_p) { +; CHECK-LABEL: define i32 @test_03_unsigned +; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr 
[[LENGTH_P:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4, !range [[RNG2]] +; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4, !range [[RNG2]] +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add nuw i32 [[X]], 4 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp ult i32 [[IV]], [[INVARIANT_OP]] +; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], [[LENGTH]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[IV_NEXT_LCSSA]] +; CHECK: out_of_bounds: +; CHECK-NEXT: ret i32 -1 +; +entry: + %x = load i32, ptr %x_p, !range !1 + %length = load i32, ptr %length_p, !range !1 + br label %loop + +loop: + %iv = phi i32 [0, %entry], [%iv.next, %backedge] + %arith = sub nuw i32 %iv, %x + %x_check = icmp ult i32 %arith, 4 + br i1 %x_check, label %out_of_bounds, label %backedge + +backedge: + %el.ptr = getelementptr i32, ptr %p, i32 %iv + store i32 1, ptr %el.ptr + %iv.next = add nuw nsw i32 %iv, 4 + %loop_cond = icmp ult i32 %iv.next, %length + br i1 %loop_cond, label %loop, label %exit + +exit: + ret i32 %iv.next + +out_of_bounds: + ret i32 -1 +} + ; TODO: iv - x < 4 ==> iv < 4 + x define i32 @test_03a(ptr %p, ptr %x_p, ptr %length_p) { ; CHECK-LABEL: define i32 @test_03a @@ -391,6 +710,68 @@ failed: ret i32 -2 } +define i32 @test_03a_unsigned(ptr %p, ptr %x_p, ptr %length_p) { +; CHECK-LABEL: define i32 @test_03a_unsigned +; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4 +; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4 +; CHECK-NEXT: [[PRECOND_1:%.*]] = icmp ult i32 [[X]], 2147483640 +; CHECK-NEXT: [[PRECOND_2:%.*]] = icmp uge i32 [[LENGTH]], 0 +; CHECK-NEXT: [[PRECOND:%.*]] = and i1 [[PRECOND_1]], [[PRECOND_2]] +; CHECK-NEXT: br i1 [[PRECOND]], label [[LOOP_PREHEADER:%.*]], label [[FAILED:%.*]] +; CHECK: loop.preheader: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[ARITH:%.*]] = sub nuw i32 [[IV]], [[X]] +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp ult i32 [[ARITH]], 4 +; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], [[LENGTH]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[IV_NEXT_LCSSA]] +; CHECK: out_of_bounds: +; CHECK-NEXT: ret i32 -1 +; CHECK: failed: +; CHECK-NEXT: ret i32 -2 +; +entry: + %x = load i32, ptr %x_p + %length = load i32, ptr %length_p + %precond_1 = icmp ult i32 %x, 2147483640 + %precond_2 = icmp uge i32 %length, 0 + 
%precond = and i1 %precond_1, %precond_2 + br i1 %precond, label %loop, label %failed + +loop: + %iv = phi i32 [0, %entry], [%iv.next, %backedge] + %arith = sub nuw i32 %iv, %x + %x_check = icmp ult i32 %arith, 4 + br i1 %x_check, label %out_of_bounds, label %backedge + +backedge: + %el.ptr = getelementptr i32, ptr %p, i32 %iv + store i32 1, ptr %el.ptr + %iv.next = add nuw nsw i32 %iv, 4 + %loop_cond = icmp ult i32 %iv.next, %length + br i1 %loop_cond, label %loop, label %exit + +exit: + ret i32 %iv.next + +out_of_bounds: + ret i32 -1 + +failed: + ret i32 -2 +} + ; iv + x < 4 ==> iv < 4 - x define i32 @test_04(ptr %p, ptr %x_p, ptr %length_p) { ; CHECK-LABEL: define i32 @test_04 @@ -441,6 +822,55 @@ out_of_bounds: ret i32 -1 } +define i32 @test_04_unsigned(ptr %p, ptr %x_p, ptr %length_p) { +; CHECK-LABEL: define i32 @test_04_unsigned +; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4, !range [[RNG3]] +; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4, !range [[RNG2]] +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = sub nuw i32 4, [[X]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp ult i32 [[IV]], [[INVARIANT_OP]] +; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], [[LENGTH]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[IV_NEXT_LCSSA]] +; CHECK: out_of_bounds: +; CHECK-NEXT: ret i32 -1 +; +entry: + %x = load i32, ptr %x_p, !range !3 + %length = load i32, ptr %length_p, !range !1 + br label %loop + +loop: + %iv = phi i32 [0, %entry], [%iv.next, %backedge] + %arith = add nuw i32 %iv, %x + %x_check = icmp ult i32 %arith, 4 + br i1 %x_check, label %out_of_bounds, label %backedge + +backedge: + %el.ptr = getelementptr i32, ptr %p, i32 %iv + store i32 1, ptr %el.ptr + %iv.next = add nuw nsw i32 %iv, 4 + %loop_cond = icmp ult i32 %iv.next, %length + br i1 %loop_cond, label %loop, label %exit + +exit: + ret i32 %iv.next + +out_of_bounds: + ret i32 -1 +} + ; TODO: iv + x < 4 ==> iv < 4 - x define i32 @test_04a(ptr %p, ptr %x_p, ptr %length_p) { ; CHECK-LABEL: define i32 @test_04a @@ -504,5 +934,69 @@ failed: ret i32 -2 } +define i32 @test_04a_unsigned(ptr %p, ptr %x_p, ptr %length_p) { +; CHECK-LABEL: define i32 @test_04a_unsigned +; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4 +; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4 +; CHECK-NEXT: [[PRECOND_1:%.*]] = icmp sge i32 [[X]], 0 +; CHECK-NEXT: [[PRECOND_2:%.*]] = icmp sge i32 [[LENGTH]], 0 +; CHECK-NEXT: [[PRECOND:%.*]] = and i1 [[PRECOND_1]], [[PRECOND_2]] +; CHECK-NEXT: br i1 [[PRECOND]], label [[LOOP_PREHEADER:%.*]], label [[FAILED:%.*]] +; CHECK: loop.preheader: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[ARITH:%.*]] = add 
nuw i32 [[IV]], [[X]]
+; CHECK-NEXT: [[X_CHECK:%.*]] = icmp ult i32 [[ARITH]], 4
+; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]]
+; CHECK: backedge:
+; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
+; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4
+; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], [[LENGTH]]
+; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ]
+; CHECK-NEXT: ret i32 [[IV_NEXT_LCSSA]]
+; CHECK: out_of_bounds:
+; CHECK-NEXT: ret i32 -1
+; CHECK: failed:
+; CHECK-NEXT: ret i32 -2
+;
+entry:
+ %x = load i32, ptr %x_p
+ %length = load i32, ptr %length_p
+ %precond_1 = icmp sge i32 %x, 0
+ %precond_2 = icmp sge i32 %length, 0
+ %precond = and i1 %precond_1, %precond_2
+ br i1 %precond, label %loop, label %failed
+
+loop:
+ %iv = phi i32 [0, %entry], [%iv.next, %backedge]
+ %arith = add nuw i32 %iv, %x
+ %x_check = icmp ult i32 %arith, 4
+ br i1 %x_check, label %out_of_bounds, label %backedge
+
+backedge:
+ %el.ptr = getelementptr i32, ptr %p, i32 %iv
+ store i32 1, ptr %el.ptr
+ %iv.next = add nuw nsw i32 %iv, 4
+ %loop_cond = icmp ult i32 %iv.next, %length
+ br i1 %loop_cond, label %loop, label %exit
+
+exit:
+ ret i32 %iv.next
+
+out_of_bounds:
+ ret i32 -1
+
+failed:
+ ret i32 -2
+}
+
!0 = !{i32 0, i32 2147483648}
!1 = !{i32 0, i32 2147483640}
+!2 = !{i32 256, i32 32768}
+!3 = !{i32 0, i32 2}

From 86a60e7f1e8f361f84ccb6e656e848dd4fbaa713 Mon Sep 17 00:00:00 2001
From: Patryk Wychowaniec
Date: Fri, 30 Aug 2024 15:25:54 +0200
Subject: [PATCH 25/98] [AVR] Fix parsing & emitting relative jumps (#106722)

Ever since 6859685a87ad093d60c8bed60b116143c0a684c7 (or, precisely,
84428dafc0941e3a31303fa1b286835ab2b8e234) relative jumps emitted by the
AVR codegen have been off by two bytes; this pull request fixes it.

## Abstract

As compared to absolute jumps, relative jumps - such as rjmp, rcall or
brsh - have an implied `pc+2` behavior; that is, `jmp 100` is `pc = 100`,
but `rjmp 100` gets understood as `pc = pc + 100 + 2`.

This is not reflected in the AVR codegen:

https://github.com/llvm/llvm-project/blob/f95026dbf66e353128a3a3d7b55f3e52d5985535/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp#L89

... which always emits relative jumps that are two bytes too far - or
rather it _would_ emit such jumps if not for this check:

https://github.com/llvm/llvm-project/blob/f95026dbf66e353128a3a3d7b55f3e52d5985535/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp#L517

... which causes most of the relative jumps to actually be resolved late,
by the linker, which applies the offsetting logic on its own, hiding the
issue within LLVM.

[Some time ago](https://github.com/llvm/llvm-project/commit/697a162fa63df328ec9ca334636c5e85390b2bf0)
we had a similar "jumps are off" problem that got solved by touching
`shouldForceRelocation()`, but I think that worked only by accident: it
exploited the fact that absolute vs relative jumps in the parsed assembly
can be distinguished through a "side channel" check relying on the
existence of labels (i.e. absolute jumps happen to named labels, but
relative jumps are anonymous, so to speak). This was an alright idea back
then, but it got broken by 6859685a87ad093d60c8bed60b116143c0a684c7.
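To make the off-by-two concrete, here is a minimal sketch of the displacement
math for an AVR relative branch. This is illustration only, not code from this
patch: the helper name and the byte-addressed `insnAddr`/`targetAddr`
parameters are assumptions.

```cpp
#include <cstdint>

// Hypothetical illustration: compute the displacement field of an AVR
// relative jump/call. The AVR manual defines RJMP as `PC <- PC + k + 1`,
// with PC counted in 16-bit words, so in byte terms the branch lands at
// insnAddr + 2 + 2*k.
int16_t encodeRelativeDisplacement(uint32_t insnAddr, uint32_t targetAddr) {
  // Subtract the implied pc+2 first; skipping this step produces jumps that
  // land two bytes past the intended target, as described above.
  int32_t byteOffset =
      static_cast<int32_t>(targetAddr) - static_cast<int32_t>(insnAddr) - 2;
  return static_cast<int16_t>(byteOffset / 2); // the field counts words
}
```

The `- 2` in byte terms corresponds to the `Value -= 1` applied in words,
after the right shift, in the backend change below.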
I propose a different approach: - when emitting relative jumps, offset them by `-2` (well, `-1`, strictly speaking, because those instructions rely on right-shifted offset), - when parsing relative jumps, treat `.` as `+2` and read `rjmp .+1234` as `rjmp (1234 + 2)`. This approach seems to be sound and now we generate the same assembly as avr-gcc, which can be confirmed with: ```cpp // avr-gcc test.c -O3 && avr-objdump -d a.out int main() { asm( " foo:\n\t" " rjmp .+2\n\t" " rjmp .-2\n\t" " rjmp foo\n\t" " rjmp .+8\n\t" " rjmp end\n\t" " rjmp .+0\n\t" " end:\n\t" " rjmp .-4\n\t" " rjmp .-6\n\t" " x:\n\t" " rjmp x\n\t" " .short 0xc00f\n\t" ); } ``` avr-gcc is also how I got the opcodes for all new tests like `inst-brbc.s`, so we should be good. --- .../lib/Target/AVR/AsmParser/AVRAsmParser.cpp | 15 +- .../Target/AVR/MCTargetDesc/AVRAsmBackend.cpp | 12 +- llvm/test/CodeGen/AVR/jmp.ll | 25 ++ llvm/test/MC/AVR/inst-brbc.s | 23 +- llvm/test/MC/AVR/inst-brbs.s | 22 +- llvm/test/MC/AVR/inst-brcc.s | 28 ++ llvm/test/MC/AVR/inst-brcs.s | 28 ++ llvm/test/MC/AVR/inst-breq.s | 28 ++ llvm/test/MC/AVR/inst-brge.s | 24 ++ llvm/test/MC/AVR/inst-brhc.s | 24 ++ llvm/test/MC/AVR/inst-brhs.s | 24 ++ llvm/test/MC/AVR/inst-brid.s | 24 ++ llvm/test/MC/AVR/inst-brie.s | 24 ++ llvm/test/MC/AVR/inst-brlo.s | 24 ++ llvm/test/MC/AVR/inst-brlt.s | 24 ++ llvm/test/MC/AVR/inst-brmi.s | 24 ++ llvm/test/MC/AVR/inst-brne.s | 28 ++ llvm/test/MC/AVR/inst-brpl.s | 24 ++ llvm/test/MC/AVR/inst-brsh.s | 24 ++ llvm/test/MC/AVR/inst-brtc.s | 24 ++ llvm/test/MC/AVR/inst-brts.s | 24 ++ llvm/test/MC/AVR/inst-brvc.s | 24 ++ llvm/test/MC/AVR/inst-brvs.s | 24 ++ llvm/test/MC/AVR/inst-family-cond-branch.s | 321 ------------------ llvm/test/MC/AVR/inst-rcall.s | 33 +- llvm/test/MC/AVR/inst-rjmp.s | 69 ++-- 26 files changed, 567 insertions(+), 401 deletions(-) create mode 100644 llvm/test/CodeGen/AVR/jmp.ll create mode 100644 llvm/test/MC/AVR/inst-brcc.s create mode 100644 llvm/test/MC/AVR/inst-brcs.s create mode 100644 llvm/test/MC/AVR/inst-breq.s create mode 100644 llvm/test/MC/AVR/inst-brge.s create mode 100644 llvm/test/MC/AVR/inst-brhc.s create mode 100644 llvm/test/MC/AVR/inst-brhs.s create mode 100644 llvm/test/MC/AVR/inst-brid.s create mode 100644 llvm/test/MC/AVR/inst-brie.s create mode 100644 llvm/test/MC/AVR/inst-brlo.s create mode 100644 llvm/test/MC/AVR/inst-brlt.s create mode 100644 llvm/test/MC/AVR/inst-brmi.s create mode 100644 llvm/test/MC/AVR/inst-brne.s create mode 100644 llvm/test/MC/AVR/inst-brpl.s create mode 100644 llvm/test/MC/AVR/inst-brsh.s create mode 100644 llvm/test/MC/AVR/inst-brtc.s create mode 100644 llvm/test/MC/AVR/inst-brts.s create mode 100644 llvm/test/MC/AVR/inst-brvc.s create mode 100644 llvm/test/MC/AVR/inst-brvs.s delete mode 100644 llvm/test/MC/AVR/inst-family-cond-branch.s diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp index 383dfcc31117c1..c016b2dd91dc67 100644 --- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp +++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp @@ -72,7 +72,7 @@ class AVRAsmParser : public MCTargetAsmParser { int parseRegisterName(); int parseRegister(bool RestoreOnFailure = false); bool tryParseRegisterOperand(OperandVector &Operands); - bool tryParseExpression(OperandVector &Operands); + bool tryParseExpression(OperandVector &Operands, int64_t offset); bool tryParseRelocExpression(OperandVector &Operands); void eatComma(); @@ -418,7 +418,7 @@ bool AVRAsmParser::tryParseRegisterOperand(OperandVector &Operands) { 
return false; } -bool AVRAsmParser::tryParseExpression(OperandVector &Operands) { +bool AVRAsmParser::tryParseExpression(OperandVector &Operands, int64_t offset) { SMLoc S = Parser.getTok().getLoc(); if (!tryParseRelocExpression(Operands)) @@ -437,6 +437,11 @@ bool AVRAsmParser::tryParseExpression(OperandVector &Operands) { if (getParser().parseExpression(Expression)) return true; + if (offset) { + Expression = MCBinaryExpr::createAdd( + Expression, MCConstantExpr::create(offset, getContext()), getContext()); + } + SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); Operands.push_back(AVROperand::CreateImm(Expression, S, E)); return false; @@ -529,8 +534,9 @@ bool AVRAsmParser::parseOperand(OperandVector &Operands, bool maybeReg) { [[fallthrough]]; case AsmToken::LParen: case AsmToken::Integer: + return tryParseExpression(Operands, 0); case AsmToken::Dot: - return tryParseExpression(Operands); + return tryParseExpression(Operands, 2); case AsmToken::Plus: case AsmToken::Minus: { // If the sign preceeds a number, parse the number, @@ -540,7 +546,7 @@ bool AVRAsmParser::parseOperand(OperandVector &Operands, bool maybeReg) { case AsmToken::BigNum: case AsmToken::Identifier: case AsmToken::Real: - if (!tryParseExpression(Operands)) + if (!tryParseExpression(Operands, 0)) return false; break; default: @@ -643,6 +649,7 @@ bool AVRAsmParser::ParseInstruction(ParseInstructionInfo &Info, // These specific operands should be treated as addresses/symbols/labels, // other than registers. bool maybeReg = true; + if (OperandNum == 1) { std::array Insts = {"lds", "adiw", "sbiw", "ldi"}; for (auto Inst : Insts) { diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp index 0d29912bee2646..388d58a82214d1 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp @@ -94,6 +94,9 @@ static void adjustRelativeBranch(unsigned Size, const MCFixup &Fixup, // Rightshifts the value by one. AVR::fixups::adjustBranchTarget(Value); + + // Jumps are relative to the current instruction. + Value -= 1; } /// 22-bit absolute fixup. @@ -513,15 +516,10 @@ bool AVRAsmBackend::shouldForceRelocation(const MCAssembler &Asm, switch ((unsigned)Fixup.getKind()) { default: return Fixup.getKind() >= FirstLiteralRelocationKind; - // Fixups which should always be recorded as relocations. case AVR::fixup_7_pcrel: case AVR::fixup_13_pcrel: - // Do not force relocation for PC relative branch like 'rjmp .', - // 'rcall . - off' and 'breq . + off'. 
- if (const auto *SymA = Target.getSymA()) - if (SymA->getSymbol().getName().size() == 0) - return false; - [[fallthrough]]; + // Always resolve relocations for PC-relative branches + return false; case AVR::fixup_call: return true; } diff --git a/llvm/test/CodeGen/AVR/jmp.ll b/llvm/test/CodeGen/AVR/jmp.ll new file mode 100644 index 00000000000000..95dfff4836b4e8 --- /dev/null +++ b/llvm/test/CodeGen/AVR/jmp.ll @@ -0,0 +1,25 @@ +; RUN: llc -filetype=obj -mtriple=avr < %s | llvm-objdump -dr --no-show-raw-insn - | FileCheck %s + +define i8 @foo(i8 %a) { +bb0: + %0 = tail call i8 @bar(i8 %a) + %1 = icmp eq i8 %0, 123 + br i1 %1, label %bb1, label %bb2 + +bb1: + ret i8 100 + +bb2: + ret i8 200 +} + +declare i8 @bar(i8); + +; CHECK: rcall .-2 +; CHECK-NEXT: 00000000: R_AVR_13_PCREL bar +; CHECK-NEXT: cpi r24, 0x7b +; CHECK-NEXT: brne .+4 +; CHECK-NEXT: ldi r24, 0x64 +; CHECK-NEXT: ret +; CHECK-NEXT: ldi r24, 0xc8 +; CHECK-NEXT: ret diff --git a/llvm/test/MC/AVR/inst-brbc.s b/llvm/test/MC/AVR/inst-brbc.s index 4d7d684da4468a..3ef3664cf07bfc 100644 --- a/llvm/test/MC/AVR/inst-brbc.s +++ b/llvm/test/MC/AVR/inst-brbc.s @@ -3,7 +3,6 @@ ; RUN: | llvm-objdump -d - | FileCheck --check-prefix=INST %s foo: - brbc 3, .+8 brbc 0, .-16 .short 0xf759 @@ -11,14 +10,16 @@ foo: .short 0xf74c .short 0xf4c7 -; CHECK: brvc .Ltmp0+8 ; encoding: [0bAAAAA011,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp0+8, kind: fixup_7_pcrel -; CHECK: brcc .Ltmp1-16 ; encoding: [0bAAAAA000,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp1-16, kind: fixup_7_pcrel +; CHECK: brvc (.Ltmp0+8)+2 ; encoding: [0bAAAAA011,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+8)+2, kind: fixup_7_pcrel +; +; CHECK: brcc (.Ltmp1-16)+2 ; encoding: [0bAAAAA000,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1-16)+2, kind: fixup_7_pcrel -; INST: 23 f4 brvc .+8 -; INST: c0 f7 brsh .-16 -; INST: 59 f7 brne .-42 -; INST: 52 f7 brpl .-44 -; INST: 4c f7 brge .-46 -; INST: c7 f4 brid .+48 +; INST-LABEL: : +; INST-NEXT: 23 f4 brvc .+8 +; INST-NEXT: c0 f7 brsh .-16 +; INST-NEXT: 59 f7 brne .-42 +; INST-NEXT: 52 f7 brpl .-44 +; INST-NEXT: 4c f7 brge .-46 +; INST-NEXT: c7 f4 brid .+48 diff --git a/llvm/test/MC/AVR/inst-brbs.s b/llvm/test/MC/AVR/inst-brbs.s index 7987feeec654a1..f15a779a53654f 100644 --- a/llvm/test/MC/AVR/inst-brbs.s +++ b/llvm/test/MC/AVR/inst-brbs.s @@ -3,7 +3,6 @@ ; RUN: | llvm-objdump -d - | FileCheck --check-prefix=INST %s foo: - brbs 3, .+8 brbs 0, .-12 .short 0xf359 @@ -11,14 +10,15 @@ foo: .short 0xf34c .short 0xf077 -; CHECK: brvs .Ltmp0+8 ; encoding: [0bAAAAA011,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp0+8, kind: fixup_7_pcrel -; CHECK: brcs .Ltmp1-12 ; encoding: [0bAAAAA000,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp1-12, kind: fixup_7_pcrel +; CHECK: brvs (.Ltmp0+8)+2 ; encoding: [0bAAAAA011,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+8)+2, kind: fixup_7_pcrel +; CHECK: brcs (.Ltmp1-12)+2 ; encoding: [0bAAAAA000,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1-12)+2, kind: fixup_7_pcrel -; INST: 23 f0 brvs .+8 -; INST: d0 f3 brlo .-12 -; INST: 59 f3 breq .-42 -; INST: 52 f3 brmi .-44 -; INST: 4c f3 brlt .-46 -; INST: 77 f0 brie .+28 +; INST-LABEL: : +; INST-NEXT: 23 f0 brvs .+8 +; INST-NEXT: d0 f3 brlo .-12 +; INST-NEXT: 59 f3 breq .-42 +; INST-NEXT: 52 f3 brmi .-44 +; INST-NEXT: 4c f3 brlt .-46 +; INST-NEXT: 77 f0 brie .+28 diff --git a/llvm/test/MC/AVR/inst-brcc.s b/llvm/test/MC/AVR/inst-brcc.s new file mode 
100644 index 00000000000000..d9218bc61e787f --- /dev/null +++ b/llvm/test/MC/AVR/inst-brcc.s @@ -0,0 +1,28 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brcc .+66 + brcc .-22 + brbc 0, .+66 + brbc 0, bar + +bar: + +; CHECK: brcc (.Ltmp0+66)+2 ; encoding: [0bAAAAA000,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+66)+2, kind: fixup_7_pcrel +; CHECK: brcc (.Ltmp1-22)+2 ; encoding: [0bAAAAA000,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1-22)+2, kind: fixup_7_pcrel +; CHECK: brcc (.Ltmp2+66)+2 ; encoding: [0bAAAAA000,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp2+66)+2, kind: fixup_7_pcrel +; CHECK: brcc bar ; encoding: [0bAAAAA000,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 08 f5 brsh .+66 +; INST-NEXT: a8 f7 brsh .-22 +; INST-NEXT: 08 f5 brsh .+66 +; INST-NEXT: 00 f4 brsh .+0 diff --git a/llvm/test/MC/AVR/inst-brcs.s b/llvm/test/MC/AVR/inst-brcs.s new file mode 100644 index 00000000000000..0012cb31f61269 --- /dev/null +++ b/llvm/test/MC/AVR/inst-brcs.s @@ -0,0 +1,28 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brcs .+8 + brcs .+4 + brbs 0, .+8 + brbs 0, bar + +bar: + +; CHECK: brcs (.Ltmp0+8)+2 ; encoding: [0bAAAAA000,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+8)+2, kind: fixup_7_pcrel +; CHECK: brcs (.Ltmp1+4)+2 ; encoding: [0bAAAAA000,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+4)+2, kind: fixup_7_pcrel +; CHECK: brcs (.Ltmp2+8)+2 ; encoding: [0bAAAAA000,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp2+8)+2, kind: fixup_7_pcrel +; CHECK: brcs bar ; encoding: [0bAAAAA000,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 20 f0 brlo .+8 +; INST-NEXT: 10 f0 brlo .+4 +; INST-NEXT: 20 f0 brlo .+8 +; INST-NEXT: 00 f0 brlo .+0 diff --git a/llvm/test/MC/AVR/inst-breq.s b/llvm/test/MC/AVR/inst-breq.s new file mode 100644 index 00000000000000..f82010f02ba617 --- /dev/null +++ b/llvm/test/MC/AVR/inst-breq.s @@ -0,0 +1,28 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + breq .-18 + breq .-12 + brbs 1, .-18 + brbs 1, bar + +bar: + +; CHECK: breq (.Ltmp0-18)+2 ; encoding: [0bAAAAA001,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0-18)+2, kind: fixup_7_pcrel +; CHECK: breq (.Ltmp1-12)+2 ; encoding: [0bAAAAA001,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1-12)+2, kind: fixup_7_pcrel +; CHECK: brbs 1, (.Ltmp2-18)+2 ; encoding: [0bAAAAA001,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp2-18)+2, kind: fixup_7_pcrel +; CHECK: brbs 1, bar ; encoding: [0bAAAAA001,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: b9 f3 breq .-18 +; INST-NEXT: d1 f3 breq .-12 +; INST-NEXT: b9 f3 breq .-18 +; INST-NEXT: 01 f0 breq .+0 diff --git a/llvm/test/MC/AVR/inst-brge.s b/llvm/test/MC/AVR/inst-brge.s new file mode 100644 index 00000000000000..1121284a114689 --- /dev/null +++ b/llvm/test/MC/AVR/inst-brge.s @@ 
-0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brge .+50 + brge .+42 + brge bar + +bar: + +; CHECK: brge (.Ltmp0+50)+2 ; encoding: [0bAAAAA100,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+50)+2, kind: fixup_7_pcrel +; CHECK: brge (.Ltmp1+42)+2 ; encoding: [0bAAAAA100,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+42)+2, kind: fixup_7_pcrel +; CHECK: brge bar ; encoding: [0bAAAAA100,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: cc f4 brge .+50 +; INST-NEXT: ac f4 brge .+42 +; INST-NEXT: 04 f4 brge .+0 diff --git a/llvm/test/MC/AVR/inst-brhc.s b/llvm/test/MC/AVR/inst-brhc.s new file mode 100644 index 00000000000000..eb16ac2ef7a64e --- /dev/null +++ b/llvm/test/MC/AVR/inst-brhc.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brhc .+12 + brhc .+14 + brhc bar + +bar: + +; CHECK: brhc (.Ltmp0+12)+2 ; encoding: [0bAAAAA101,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+12)+2, kind: fixup_7_pcrel +; CHECK: brhc (.Ltmp1+14)+2 ; encoding: [0bAAAAA101,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+14)+2, kind: fixup_7_pcrel +; CHECK: brhc bar ; encoding: [0bAAAAA101,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 35 f4 brhc .+12 +; INST-NEXT: 3d f4 brhc .+14 +; INST-NEXT: 05 f4 brhc .+0 diff --git a/llvm/test/MC/AVR/inst-brhs.s b/llvm/test/MC/AVR/inst-brhs.s new file mode 100644 index 00000000000000..77c49596b3b0b8 --- /dev/null +++ b/llvm/test/MC/AVR/inst-brhs.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brhs .-66 + brhs .+14 + brhs bar + +bar: + +; CHECK: brhs (.Ltmp0-66)+2 ; encoding: [0bAAAAA101,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0-66)+2, kind: fixup_7_pcrel +; CHECK: brhs (.Ltmp1+14)+2 ; encoding: [0bAAAAA101,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+14)+2, kind: fixup_7_pcrel +; CHECK: brhs bar ; encoding: [0bAAAAA101,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: fd f2 brhs .-66 +; INST-NEXT: 3d f0 brhs .+14 +; INST-NEXT: 05 f0 brhs .+0 diff --git a/llvm/test/MC/AVR/inst-brid.s b/llvm/test/MC/AVR/inst-brid.s new file mode 100644 index 00000000000000..70d0ea83c49b2a --- /dev/null +++ b/llvm/test/MC/AVR/inst-brid.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brid .+42 + brid .+62 + brid bar + +bar: + +; CHECK: brid (.Ltmp0+42)+2 ; encoding: [0bAAAAA111,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+42)+2, kind: fixup_7_pcrel +; CHECK: brid (.Ltmp1+62)+2 ; encoding: [0bAAAAA111,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+62)+2, kind: fixup_7_pcrel +; CHECK: brid bar ; encoding: [0bAAAAA111,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + 
+; INST-LABEL: : +; INST-NEXT: af f4 brid .+42 +; INST-NEXT: ff f4 brid .+62 +; INST-NEXT: 07 f4 brid .+0 diff --git a/llvm/test/MC/AVR/inst-brie.s b/llvm/test/MC/AVR/inst-brie.s new file mode 100644 index 00000000000000..717c686e2ed44e --- /dev/null +++ b/llvm/test/MC/AVR/inst-brie.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brie .+20 + brie .+40 + brie bar + +bar: + +; CHECK: brie (.Ltmp0+20)+2 ; encoding: [0bAAAAA111,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+20)+2, kind: fixup_7_pcrel +; CHECK: brie (.Ltmp1+40)+2 ; encoding: [0bAAAAA111,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+40)+2, kind: fixup_7_pcrel +; CHECK: brie bar ; encoding: [0bAAAAA111,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 57 f0 brie .+20 +; INST-NEXT: a7 f0 brie .+40 +; INST-NEXT: 07 f0 brie .+0 diff --git a/llvm/test/MC/AVR/inst-brlo.s b/llvm/test/MC/AVR/inst-brlo.s new file mode 100644 index 00000000000000..4b56d66ffdfe00 --- /dev/null +++ b/llvm/test/MC/AVR/inst-brlo.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brlo .+12 + brlo .+28 + brlo bar + +bar: + +; CHECK: brlo (.Ltmp0+12)+2 ; encoding: [0bAAAAA000,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+12)+2, kind: fixup_7_pcrel +; CHECK: brlo (.Ltmp1+28)+2 ; encoding: [0bAAAAA000,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+28)+2, kind: fixup_7_pcrel +; CHECK: brlo bar ; encoding: [0bAAAAA000,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 30 f0 brlo .+12 +; INST-NEXT: 70 f0 brlo .+28 +; INST-NEXT: 00 f0 brlo .+0 diff --git a/llvm/test/MC/AVR/inst-brlt.s b/llvm/test/MC/AVR/inst-brlt.s new file mode 100644 index 00000000000000..8a7c543f9444b1 --- /dev/null +++ b/llvm/test/MC/AVR/inst-brlt.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brlt .+16 + brlt .+2 + brlt bar + +bar: + +; CHECK: brlt (.Ltmp0+16)+2 ; encoding: [0bAAAAA100,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+16)+2, kind: fixup_7_pcrel +; CHECK: brlt (.Ltmp1+2)+2 ; encoding: [0bAAAAA100,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+2)+2, kind: fixup_7_pcrel +; CHECK: brlt bar ; encoding: [0bAAAAA100,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 44 f0 brlt .+16 +; INST-NEXT: 0c f0 brlt .+2 +; INST-NEXT: 04 f0 brlt .+0 diff --git a/llvm/test/MC/AVR/inst-brmi.s b/llvm/test/MC/AVR/inst-brmi.s new file mode 100644 index 00000000000000..878612d294dd95 --- /dev/null +++ b/llvm/test/MC/AVR/inst-brmi.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brmi .+66 + brmi .+58 + brmi bar + +bar: + +; CHECK: brmi (.Ltmp0+66)+2 ; encoding: [0bAAAAA010,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+66)+2, kind: 
fixup_7_pcrel +; CHECK: brmi (.Ltmp1+58)+2 ; encoding: [0bAAAAA010,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+58)+2, kind: fixup_7_pcrel +; CHECK: brmi bar ; encoding: [0bAAAAA010,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 0a f1 brmi .+66 +; INST-NEXT: ea f0 brmi .+58 +; INST-NEXT: 02 f0 brmi .+0 diff --git a/llvm/test/MC/AVR/inst-brne.s b/llvm/test/MC/AVR/inst-brne.s new file mode 100644 index 00000000000000..9d6bee4b754d95 --- /dev/null +++ b/llvm/test/MC/AVR/inst-brne.s @@ -0,0 +1,28 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brne .+10 + brne .+2 + brbc 1, .+10 + brbc 1, bar + +bar: + +; CHECK: brne (.Ltmp0+10)+2 ; encoding: [0bAAAAA001,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+10)+2, kind: fixup_7_pcrel +; CHECK: brne (.Ltmp1+2)+2 ; encoding: [0bAAAAA001,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+2)+2, kind: fixup_7_pcrel +; CHECK: brbc 1, (.Ltmp2+10)+2 ; encoding: [0bAAAAA001,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp2+10)+2, kind: fixup_7_pcrel +; CHECK: brbc 1, bar ; encoding: [0bAAAAA001,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 29 f4 brne .+10 +; INST-NEXT: 09 f4 brne .+2 +; INST-NEXT: 29 f4 brne .+10 +; INST-NEXT: 01 f4 brne .+0 diff --git a/llvm/test/MC/AVR/inst-brpl.s b/llvm/test/MC/AVR/inst-brpl.s new file mode 100644 index 00000000000000..393365ee35339e --- /dev/null +++ b/llvm/test/MC/AVR/inst-brpl.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brpl .-12 + brpl .+18 + brpl bar + +bar: + +; CHECK: brpl (.Ltmp0-12)+2 ; encoding: [0bAAAAA010,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0-12)+2, kind: fixup_7_pcrel +; CHECK: brpl (.Ltmp1+18)+2 ; encoding: [0bAAAAA010,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+18)+2, kind: fixup_7_pcrel +; CHECK: brpl bar ; encoding: [0bAAAAA010,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: d2 f7 brpl .-12 +; INST-NEXT: 4a f4 brpl .+18 +; INST-NEXT: 02 f4 brpl .+0 diff --git a/llvm/test/MC/AVR/inst-brsh.s b/llvm/test/MC/AVR/inst-brsh.s new file mode 100644 index 00000000000000..0bacd64d3d8d05 --- /dev/null +++ b/llvm/test/MC/AVR/inst-brsh.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brsh .+32 + brsh .+70 + brsh bar + +bar: + +; CHECK: brsh (.Ltmp0+32)+2 ; encoding: [0bAAAAA000,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+32)+2, kind: fixup_7_pcrel +; CHECK: brsh (.Ltmp1+70)+2 ; encoding: [0bAAAAA000,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+70)+2, kind: fixup_7_pcrel +; CHECK: brsh bar ; encoding: [0bAAAAA000,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 80 f4 brsh .+32 +; INST-NEXT: 18 f5 brsh .+70 +; INST-NEXT: 00 f4 brsh .+0 diff --git a/llvm/test/MC/AVR/inst-brtc.s b/llvm/test/MC/AVR/inst-brtc.s new file 
mode 100644 index 00000000000000..eb4ee211628721 --- /dev/null +++ b/llvm/test/MC/AVR/inst-brtc.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brtc .+52 + brtc .+50 + brtc bar + +bar: + +; CHECK: brtc (.Ltmp0+52)+2 ; encoding: [0bAAAAA110,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+52)+2, kind: fixup_7_pcrel +; CHECK: brtc (.Ltmp1+50)+2 ; encoding: [0bAAAAA110,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+50)+2, kind: fixup_7_pcrel +; CHECK: brtc bar ; encoding: [0bAAAAA110,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: d6 f4 brtc .+52 +; INST-NEXT: ce f4 brtc .+50 +; INST-NEXT: 06 f4 brtc .+0 diff --git a/llvm/test/MC/AVR/inst-brts.s b/llvm/test/MC/AVR/inst-brts.s new file mode 100644 index 00000000000000..ccd794a9225894 --- /dev/null +++ b/llvm/test/MC/AVR/inst-brts.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brts .+18 + brts .+22 + brts bar + +bar: + +; CHECK: brts (.Ltmp0+18)+2 ; encoding: [0bAAAAA110,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+18)+2, kind: fixup_7_pcrel +; CHECK: brts (.Ltmp1+22)+2 ; encoding: [0bAAAAA110,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+22)+2, kind: fixup_7_pcrel +; CHECK: brts bar ; encoding: [0bAAAAA110,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 4e f0 brts .+18 +; INST-NEXT: 5e f0 brts .+22 +; INST-NEXT: 06 f0 brts .+0 diff --git a/llvm/test/MC/AVR/inst-brvc.s b/llvm/test/MC/AVR/inst-brvc.s new file mode 100644 index 00000000000000..573f779c0dcd6a --- /dev/null +++ b/llvm/test/MC/AVR/inst-brvc.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brvc .-28 + brvc .-62 + brvc bar + +bar: + +; CHECK: brvc (.Ltmp0-28)+2 ; encoding: [0bAAAAA011,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0-28)+2, kind: fixup_7_pcrel +; CHECK: brvc (.Ltmp1-62)+2 ; encoding: [0bAAAAA011,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1-62)+2, kind: fixup_7_pcrel +; CHECK: brvc bar ; encoding: [0bAAAAA011,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 93 f7 brvc .-28 +; INST-NEXT: 0b f7 brvc .-62 +; INST-NEXT: 03 f4 brvc .+0 diff --git a/llvm/test/MC/AVR/inst-brvs.s b/llvm/test/MC/AVR/inst-brvs.s new file mode 100644 index 00000000000000..d50a1a9ec5b62f --- /dev/null +++ b/llvm/test/MC/AVR/inst-brvs.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brvs .+18 + brvs .+32 + brvs bar + +bar: + +; CHECK: brvs (.Ltmp0+18)+2 ; encoding: [0bAAAAA011,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+18)+2, kind: fixup_7_pcrel +; CHECK: brvs (.Ltmp1+32)+2 ; encoding: [0bAAAAA011,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+32)+2, kind: fixup_7_pcrel +; CHECK: brvs bar ; 
encoding: [0bAAAAA011,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 4b f0 brvs .+18 +; INST-NEXT: 83 f0 brvs .+32 +; INST-NEXT: 03 f0 brvs .+0 diff --git a/llvm/test/MC/AVR/inst-family-cond-branch.s b/llvm/test/MC/AVR/inst-family-cond-branch.s deleted file mode 100644 index dc36425a884f3b..00000000000000 --- a/llvm/test/MC/AVR/inst-family-cond-branch.s +++ /dev/null @@ -1,321 +0,0 @@ -; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s -; RUN: llvm-mc -filetype=obj -triple avr < %s \ -; RUN: | llvm-objdump -d - | FileCheck --check-prefix=INST %s - - -foo: - ; BREQ - breq .-18 - breq .-12 - brbs 1, .-18 - brbs 1, baz - -; CHECK: breq .Ltmp0-18 ; encoding: [0bAAAAA001,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp0-18, kind: fixup_7_pcrel -; CHECK: breq .Ltmp1-12 ; encoding: [0bAAAAA001,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp1-12, kind: fixup_7_pcrel -; CHECK: brbs 1, .Ltmp2-18 ; encoding: [0bAAAAA001,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp2-18, kind: fixup_7_pcrel -; CHECK: brbs 1, baz ; encoding: [0bAAAAA001,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: baz, kind: fixup_7_pcrel - -; INST-LABEL: : -; INST: breq .-18 -; INST: breq .-12 -; INST: breq .-18 -; INST: breq .+0 - - ; BRNE - brne .+10 - brne .+2 - brbc 1, .+10 - brbc 1, bar - -; CHECK: brne .Ltmp3+10 ; encoding: [0bAAAAA001,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp3+10, kind: fixup_7_pcrel -; CHECK: brne .Ltmp4+2 ; encoding: [0bAAAAA001,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp4+2, kind: fixup_7_pcrel -; CHECK: brbc 1, .Ltmp5+10 ; encoding: [0bAAAAA001,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp5+10, kind: fixup_7_pcrel -; CHECK: brbc 1, bar ; encoding: [0bAAAAA001,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel - -; INST: brne .+10 -; INST: brne .+2 -; INST: brne .+10 -; INST: brne .+0 - -bar: - ; BRCS - brcs .+8 - brcs .+4 - brbs 0, .+8 - brbs 0, end - -; CHECK: brcs .Ltmp6+8 ; encoding: [0bAAAAA000,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp6+8, kind: fixup_7_pcrel -; CHECK: brcs .Ltmp7+4 ; encoding: [0bAAAAA000,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp7+4, kind: fixup_7_pcrel -; CHECK: brcs .Ltmp8+8 ; encoding: [0bAAAAA000,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp8+8, kind: fixup_7_pcrel -; CHECK: brcs end ; encoding: [0bAAAAA000,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: end, kind: fixup_7_pcrel - -; INST-LABEL: : -; INST: brlo .+8 -; INST: brlo .+4 -; INST: brlo .+8 -; INST: brlo .+0 - - ; BRCC - brcc .+66 - brcc .-22 - brbc 0, .+66 - brbc 0, baz - -; CHECK: brcc .Ltmp9+66 ; encoding: [0bAAAAA000,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp9+66, kind: fixup_7_pcrel -; CHECK: brcc .Ltmp10-22 ; encoding: [0bAAAAA000,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp10-22, kind: fixup_7_pcrel -; CHECK: brcc .Ltmp11+66 ; encoding: [0bAAAAA000,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp11+66, kind: fixup_7_pcrel -; CHECK: brcc baz ; encoding: [0bAAAAA000,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: baz, kind: fixup_7_pcrel - -; INST: brsh .+66 -; INST: brsh .-22 -; INST: brsh .+66 -; INST: brsh .+0 - -; BRSH - brsh .+32 - brsh .+70 - brsh car - -; CHECK: brsh .Ltmp12+32 ; encoding: [0bAAAAA000,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp12+32, kind: fixup_7_pcrel -; CHECK: brsh .Ltmp13+70 ; encoding: 
[0bAAAAA000,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp13+70, kind: fixup_7_pcrel -; CHECK: brsh car ; encoding: [0bAAAAA000,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: car, kind: fixup_7_pcrel - -; INST: brsh .+32 -; INST: brsh .+70 -; INST: brsh .+0 - -baz: - - ; BRLO - brlo .+12 - brlo .+28 - brlo car - -; CHECK: brlo .Ltmp14+12 ; encoding: [0bAAAAA000,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp14+12, kind: fixup_7_pcrel -; CHECK: brlo .Ltmp15+28 ; encoding: [0bAAAAA000,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp15+28, kind: fixup_7_pcrel -; CHECK: brlo car ; encoding: [0bAAAAA000,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: car, kind: fixup_7_pcrel - -; INST-LABEL: : -; INST: brlo .+12 -; INST: brlo .+28 -; INST: brlo .+0 - - ; BRMI - brmi .+66 - brmi .+58 - brmi car - -; CHECK: brmi .Ltmp16+66 ; encoding: [0bAAAAA010,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp16+66, kind: fixup_7_pcrel -; CHECK: brmi .Ltmp17+58 ; encoding: [0bAAAAA010,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp17+58, kind: fixup_7_pcrel -; CHECK: brmi car ; encoding: [0bAAAAA010,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: car, kind: fixup_7_pcrel - -; INST: brmi .+66 -; INST: brmi .+58 -; INST: brmi .+0 - - ; BRPL - brpl .-12 - brpl .+18 - brpl car - -; CHECK: brpl .Ltmp18-12 ; encoding: [0bAAAAA010,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp18-12, kind: fixup_7_pcrel -; CHECK: brpl .Ltmp19+18 ; encoding: [0bAAAAA010,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp19+18, kind: fixup_7_pcrel -; CHECK: brpl car ; encoding: [0bAAAAA010,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: car, kind: fixup_7_pcrel - -; INST: brpl .-12 -; INST: brpl .+18 -; INST: brpl .+0 - -; BRGE - brge .+50 - brge .+42 - brge car - -; CHECK: brge .Ltmp20+50 ; encoding: [0bAAAAA100,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp20+50, kind: fixup_7_pcrel -; CHECK: brge .Ltmp21+42 ; encoding: [0bAAAAA100,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp21+42, kind: fixup_7_pcrel -; CHECK: brge car ; encoding: [0bAAAAA100,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: car, kind: fixup_7_pcrel - -; INST: brge .+50 -; INST: brge .+42 -; INST: brge .+0 - -car: - ; BRLT - brlt .+16 - brlt .+2 - brlt end - -; CHECK: brlt .Ltmp22+16 ; encoding: [0bAAAAA100,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp22+16, kind: fixup_7_pcrel -; CHECK: brlt .Ltmp23+2 ; encoding: [0bAAAAA100,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp23+2, kind: fixup_7_pcrel -; CHECK: brlt end ; encoding: [0bAAAAA100,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: end, kind: fixup_7_pcrel - -; INST-LABEL: : -; INST: brlt .+16 -; INST: brlt .+2 -; INST: brlt .+0 - - ; BRHS - brhs .-66 - brhs .+14 - brhs just_another_label - -; CHECK: brhs .Ltmp24-66 ; encoding: [0bAAAAA101,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp24-66, kind: fixup_7_pcrel -; CHECK: brhs .Ltmp25+14 ; encoding: [0bAAAAA101,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp25+14, kind: fixup_7_pcrel -; CHECK: brhs just_another_label ; encoding: [0bAAAAA101,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: just_another_label, kind: fixup_7_pcrel - -; INST: brhs .-66 -; INST: brhs .+14 -; INST: brhs .+0 - - ; BRHC - brhc .+12 - brhc .+14 - brhc just_another_label - -; CHECK: brhc .Ltmp26+12 ; encoding: [0bAAAAA101,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp26+12, kind: fixup_7_pcrel -; CHECK: brhc .Ltmp27+14 ; 
encoding: [0bAAAAA101,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp27+14, kind: fixup_7_pcrel -; CHECK: brhc just_another_label ; encoding: [0bAAAAA101,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: just_another_label, kind: fixup_7_pcrel - -; INST: brhc .+12 -; INST: brhc .+14 -; INST: brhc .+0 - - ; BRTS - brts .+18 - brts .+22 - brts just_another_label - -; CHECK: brts .Ltmp28+18 ; encoding: [0bAAAAA110,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp28+18, kind: fixup_7_pcrel -; CHECK: brts .Ltmp29+22 ; encoding: [0bAAAAA110,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp29+22, kind: fixup_7_pcrel -; CHECK: brts just_another_label ; encoding: [0bAAAAA110,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: just_another_label, kind: fixup_7_pcrel - -; INST: brts .+18 -; INST: brts .+22 -; INST: brts .+0 - -just_another_label: - ; BRTC - brtc .+52 - brtc .+50 - brtc end - -; CHECK: brtc .Ltmp30+52 ; encoding: [0bAAAAA110,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp30+52, kind: fixup_7_pcrel -; CHECK: brtc .Ltmp31+50 ; encoding: [0bAAAAA110,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp31+50, kind: fixup_7_pcrel -; CHECK: brtc end ; encoding: [0bAAAAA110,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: end, kind: fixup_7_pcrel - -; INST-LABEL: : -; INST: brtc .+52 -; INST: brtc .+50 -; INST: brtc .+0 - - ; BRVS - brvs .+18 - brvs .+32 - brvs end - -; CHECK: brvs .Ltmp32+18 ; encoding: [0bAAAAA011,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp32+18, kind: fixup_7_pcrel -; CHECK: brvs .Ltmp33+32 ; encoding: [0bAAAAA011,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp33+32, kind: fixup_7_pcrel -; CHECK: brvs end ; encoding: [0bAAAAA011,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: end, kind: fixup_7_pcrel - -; INST: brvs .+18 -; INST: brvs .+32 -; INST: brvs .+0 - - ; BRVC - brvc .-28 - brvc .-62 - brvc end - -; CHECK: brvc .Ltmp34-28 ; encoding: [0bAAAAA011,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp34-28, kind: fixup_7_pcrel -; CHECK: brvc .Ltmp35-62 ; encoding: [0bAAAAA011,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp35-62, kind: fixup_7_pcrel -; CHECK: brvc end ; encoding: [0bAAAAA011,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: end, kind: fixup_7_pcrel - -; INST: brvc .-28 -; INST: brvc .-62 -; INST: brvc .+0 - - ; BRIE - brie .+20 - brie .+40 - brie end - -; CHECK: brie .Ltmp36+20 ; encoding: [0bAAAAA111,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp36+20, kind: fixup_7_pcrel -; CHECK: brie .Ltmp37+40 ; encoding: [0bAAAAA111,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp37+40, kind: fixup_7_pcrel -; CHECK: brie end ; encoding: [0bAAAAA111,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: end, kind: fixup_7_pcrel - -; INST: brie .+20 -; INST: brie .+40 -; INST: brie .+0 - - ; BRID - brid .+42 - brid .+62 - brid end - -; CHECK: brid .Ltmp38+42 ; encoding: [0bAAAAA111,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp38+42, kind: fixup_7_pcrel -; CHECK: brid .Ltmp39+62 ; encoding: [0bAAAAA111,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp39+62, kind: fixup_7_pcrel -; CHECK: brid end ; encoding: [0bAAAAA111,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: end, kind: fixup_7_pcrel - -; INST: brid .+42 -; INST: brid .+62 -; INST: brid .+0 - -end: diff --git a/llvm/test/MC/AVR/inst-rcall.s b/llvm/test/MC/AVR/inst-rcall.s index 006013aa6ea946..a4ec32d05b1a43 100644 --- a/llvm/test/MC/AVR/inst-rcall.s +++ 
b/llvm/test/MC/AVR/inst-rcall.s @@ -1,27 +1,28 @@ ; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; ; RUN: llvm-mc -filetype=obj -triple avr < %s \ -; RUN: | llvm-objdump -d - | FileCheck --check-prefix=INST %s - +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s foo: - rcall .+0 rcall .-8 rcall .+12 rcall .+46 .short 0xdfea -; CHECK: rcall .Ltmp0+0 ; encoding: [A,0b1101AAAA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp0+0, kind: fixup_13_pcrel -; CHECK: rcall .Ltmp1-8 ; encoding: [A,0b1101AAAA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp1-8, kind: fixup_13_pcrel -; CHECK: rcall .Ltmp2+12 ; encoding: [A,0b1101AAAA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp2+12, kind: fixup_13_pcrel -; CHECK: rcall .Ltmp3+46 ; encoding: [A,0b1101AAAA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp3+46, kind: fixup_13_pcrel +; CHECK: rcall (.Ltmp0+0)+2 ; encoding: [A,0b1101AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+0)+2, kind: fixup_13_pcrel +; CHECK: rcall (.Ltmp1-8)+2 ; encoding: [A,0b1101AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1-8)+2, kind: fixup_13_pcrel +; CHECK: rcall (.Ltmp2+12)+2 ; encoding: [A,0b1101AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp2+12)+2, kind: fixup_13_pcrel +; CHECK: rcall (.Ltmp3+46)+2 ; encoding: [A,0b1101AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp3+46)+2, kind: fixup_13_pcrel -; INST: 00 d0 rcall .+0 -; INST: fc df rcall .-8 -; INST: 06 d0 rcall .+12 -; INST: 17 d0 rcall .+46 -; INST: ea df rcall .-44 +; INST-LABEL: : +; INST-NEXT: 00 d0 rcall .+0 +; INST-NEXT: fc df rcall .-8 +; INST-NEXT: 06 d0 rcall .+12 +; INST-NEXT: 17 d0 rcall .+46 +; INST-NEXT: ea df rcall .-44 diff --git a/llvm/test/MC/AVR/inst-rjmp.s b/llvm/test/MC/AVR/inst-rjmp.s index 3dbac39e055ddf..cc843a58b55d2c 100644 --- a/llvm/test/MC/AVR/inst-rjmp.s +++ b/llvm/test/MC/AVR/inst-rjmp.s @@ -1,49 +1,56 @@ ; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; ; RUN: llvm-mc -filetype=obj -triple avr < %s \ -; RUN: | llvm-objdump -d - | FileCheck --check-prefix=INST %s - +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s foo: - rjmp .+2 rjmp .-2 rjmp foo rjmp .+8 rjmp end rjmp .+0 + end: rjmp .-4 rjmp .-6 + x: rjmp x .short 0xc00f -; CHECK: rjmp .Ltmp0+2 ; encoding: [A,0b1100AAAA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp0+2, kind: fixup_13_pcrel -; CHECK: rjmp .Ltmp1-2 ; encoding: [A,0b1100AAAA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp1-2, kind: fixup_13_pcrel -; CHECK: rjmp foo ; encoding: [A,0b1100AAAA] -; CHECK: ; fixup A - offset: 0, value: foo, kind: fixup_13_pcrel -; CHECK: rjmp .Ltmp2+8 ; encoding: [A,0b1100AAAA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp2+8, kind: fixup_13_pcrel -; CHECK: rjmp end ; encoding: [A,0b1100AAAA] -; CHECK: ; fixup A - offset: 0, value: end, kind: fixup_13_pcrel -; CHECK: rjmp .Ltmp3+0 ; encoding: [A,0b1100AAAA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp3+0, kind: fixup_13_pcrel -; CHECK: rjmp .Ltmp4-4 ; encoding: [A,0b1100AAAA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp4-4, kind: fixup_13_pcrel -; CHECK: rjmp .Ltmp5-6 ; encoding: [A,0b1100AAAA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp5-6, kind: fixup_13_pcrel -; CHECK: rjmp x ; encoding: [A,0b1100AAAA] -; CHECK: ; fixup A - offset: 0, value: x, kind: fixup_13_pcrel +; CHECK: rjmp (.Ltmp0+2)+2 ; encoding: [A,0b1100AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+2)+2, kind: fixup_13_pcrel +; CHECK: rjmp (.Ltmp1-2)+2 ; encoding: [A,0b1100AAAA] +; CHECK-NEXT: ; fixup A - 
offset: 0, value: (.Ltmp1-2)+2, kind: fixup_13_pcrel +; CHECK: rjmp foo ; encoding: [A,0b1100AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: foo, kind: fixup_13_pcrel +; CHECK: rjmp (.Ltmp2+8)+2 ; encoding: [A,0b1100AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp2+8)+2, kind: fixup_13_pcrel +; CHECK: rjmp end ; encoding: [A,0b1100AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: end, kind: fixup_13_pcrel +; CHECK: rjmp (.Ltmp3+0)+2 ; encoding: [A,0b1100AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp3+0)+2, kind: fixup_13_pcrel +; CHECK: rjmp (.Ltmp4-4)+2 ; encoding: [A,0b1100AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp4-4)+2, kind: fixup_13_pcrel +; CHECK: rjmp (.Ltmp5-6)+2 ; encoding: [A,0b1100AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp5-6)+2, kind: fixup_13_pcrel +; CHECK: rjmp x ; encoding: [A,0b1100AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: x, kind: fixup_13_pcrel -; INST: 01 c0 rjmp .+2 -; INST: ff cf rjmp .-2 -; INST: 00 c0 rjmp .+0 -; INST: 04 c0 rjmp .+8 -; INST: 00 c0 rjmp .+0 -; INST: 00 c0 rjmp .+0 -; INST: fe cf rjmp .-4 -; INST: fd cf rjmp .-6 -; INST: 00 c0 rjmp .+0 -; INST: 0f c0 rjmp .+30 +; INST-LABEL: : +; INST-NEXT: 01 c0 rjmp .+2 +; INST-NEXT: ff cf rjmp .-2 +; INST-NEXT: fd cf rjmp .-6 +; INST-NEXT: 04 c0 rjmp .+8 +; INST-NEXT: 01 c0 rjmp .+2 +; INST-NEXT: 00 c0 rjmp .+0 +; INST-EMPTY: +; INST-LABEL: : +; INST-NEXT: fe cf rjmp .-4 +; INST-NEXT: fd cf rjmp .-6 +; INST-EMPTY: +; INST-LABEL: : +; INST-NEXT: ff cf rjmp .-2 +; INST-NEXT: 0f c0 rjmp .+30 From a919588df4f108cef5829363a9ec6a1968dbb03a Mon Sep 17 00:00:00 2001 From: Chris Apple Date: Fri, 30 Aug 2024 06:29:09 -0700 Subject: [PATCH 26/98] [compiler-rt][rtsan] NFC: Rename rtsan_on->rtsan_enable rtsan_off->rtsan_disable (#106727) This better matches lsan_enable and disable, which we are trying to emulate. --- compiler-rt/lib/rtsan/rtsan.cpp | 4 ++-- compiler-rt/lib/rtsan/rtsan.h | 6 +++--- compiler-rt/lib/rtsan/tests/rtsan_test_functional.cpp | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/compiler-rt/lib/rtsan/rtsan.cpp b/compiler-rt/lib/rtsan/rtsan.cpp index 8a7ff03c611c65..b2c4616b5fd0dc 100644 --- a/compiler-rt/lib/rtsan/rtsan.cpp +++ b/compiler-rt/lib/rtsan/rtsan.cpp @@ -58,11 +58,11 @@ SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_realtime_exit() { __rtsan::GetContextForThisThread().RealtimePop(); } -SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_off() { +SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_disable() { __rtsan::GetContextForThisThread().BypassPush(); } -SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_on() { +SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_enable() { __rtsan::GetContextForThisThread().BypassPop(); } diff --git a/compiler-rt/lib/rtsan/rtsan.h b/compiler-rt/lib/rtsan/rtsan.h index 3d665c98aed184..ae23609f97d2dc 100644 --- a/compiler-rt/lib/rtsan/rtsan.h +++ b/compiler-rt/lib/rtsan/rtsan.h @@ -38,11 +38,11 @@ SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_realtime_exit(); // Disable all RTSan error reporting. // Injected into the code if "nosanitize(realtime)" is on a function. -SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_off(); +SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_disable(); // Re-enable all RTSan error reporting. -// The counterpart to `__rtsan_off`. -SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_on(); +// The counterpart to `__rtsan_disable`. 
+SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_enable(); SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_expect_not_realtime(const char *intercepted_function_name); diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_functional.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_functional.cpp index 6e7ab016a4c6b2..5a86957170dcec 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_functional.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_functional.cpp @@ -204,10 +204,10 @@ TEST(TestRtsan, ThrowingAnExceptionDiesWhenRealtime) { TEST(TestRtsan, DoesNotDieIfTurnedOff) { std::mutex mutex; auto RealtimeUnsafeFunc = [&]() { - __rtsan_off(); + __rtsan_disable(); mutex.lock(); mutex.unlock(); - __rtsan_on(); + __rtsan_enable(); }; RealtimeInvoke(RealtimeUnsafeFunc); } From 7ffe67c17c524c2d3056c0721a33c7012dce3061 Mon Sep 17 00:00:00 2001 From: Orlando Cazalet-Hyams Date: Fri, 30 Aug 2024 14:39:11 +0100 Subject: [PATCH 27/98] [RemoveDIs] Fix asan-identified leak in unittest (#106723) Fixes issue found here https://github.com/llvm/llvm-project/pull/106691#issuecomment-2320960847 The issue wasn't in the code change itself, just the unittest; the trailing marker wasn't properly cleaned up. --- llvm/unittests/IR/BasicBlockDbgInfoTest.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp index 5615a4493d20a1..5ce14d3f6b9cef 100644 --- a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp +++ b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp @@ -1569,14 +1569,12 @@ TEST(BasicBlockDbgInfoTest, CloneTrailingRecordsToEmptyBlock) { // The trailing records should've been absorbed into NewBB. EXPECT_FALSE(BB.getTrailingDbgRecords()); EXPECT_TRUE(NewBB->getTrailingDbgRecords()); - if (NewBB->getTrailingDbgRecords()) { - EXPECT_EQ( - llvm::range_size(NewBB->getTrailingDbgRecords()->getDbgRecordRange()), - 1u); + if (DbgMarker *Trailing = NewBB->getTrailingDbgRecords()) { + EXPECT_EQ(llvm::range_size(Trailing->getDbgRecordRange()), 1u); + // Drop the trailing records now, to prevent a cleanup assertion. + Trailing->eraseFromParent(); + NewBB->deleteTrailingDbgRecords(); } - - // Drop the trailing records now, to prevent a cleanup assertion. - NewBB->deleteTrailingDbgRecords(); } } // End anonymous namespace. From 4a10b4c0bd241f3a2d7162fe29f520af7da6840c Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Fri, 30 Aug 2024 15:43:20 +0200 Subject: [PATCH 28/98] [flang] fix flang builds with clang 20 after #100692 (#106718) #100692 changed clang template deduction, and an error is now emitted when building flang with top-of-tree clang when mapping std::pow in intrinsics-library.cpp for constant folding: `error: address of overloaded function 'pow' is ambiguous` See https://lab.llvm.org/buildbot/#/builders/4/builds/1670 I am not expert enough to tell whether the new error is justified, but it is easy to help the compiler here with explicit wrappers to fix the builds.
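As a rough illustration of the wrapper technique (hypothetical names, not the exact code in this patch): taking the address of an overloaded function such as std::pow names a whole overload set, whereas a thin named wrapper with one fixed signature gives the compiler a single function whose address can be taken.

    #include <complex>

    // A named wrapper pins down a single signature; ordinary overload
    // resolution inside the body still selects the right std::pow overload.
    template <typename HostT>
    static std::complex<HostT> PowWrapper(const std::complex<HostT> &x,
                                          const std::complex<HostT> &y) {
      return std::pow(x, y);
    }

    // &PowWrapper<float> is a single, unambiguous address; &std::pow is not.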
--- flang/lib/Evaluate/intrinsics-library.cpp | 25 ++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/flang/lib/Evaluate/intrinsics-library.cpp b/flang/lib/Evaluate/intrinsics-library.cpp index 65636b9956e780..ed28d8130808fa 100644 --- a/flang/lib/Evaluate/intrinsics-library.cpp +++ b/flang/lib/Evaluate/intrinsics-library.cpp @@ -255,6 +255,25 @@ struct HostRuntimeLibrary { static constexpr HostRuntimeMap map{table}; static_assert(map.Verify(), "map must be sorted"); }; + +// Helpers to map complex std::pow whose resolution in F2{std::pow} is +// ambiguous as of clang++ 20. +template <typename HostT> +static std::complex<HostT> StdPowF2( const std::complex<HostT> &x, const std::complex<HostT> &y) { + return std::pow(x, y); +} +template <typename HostT> +static std::complex<HostT> StdPowF2A( const HostT &x, const std::complex<HostT> &y) { + return std::pow(x, y); +} +template <typename HostT> +static std::complex<HostT> StdPowF2B( const std::complex<HostT> &x, const HostT &y) { + return std::pow(x, y); +} + template <typename HostT> struct HostRuntimeLibrary<std::complex<HostT>, LibraryVersion::Libm> { using F = FuncPointer<std::complex<HostT>, const std::complex<HostT> &>; @@ -275,9 +294,9 @@ struct HostRuntimeLibrary<std::complex<HostT>, LibraryVersion::Libm> { FolderFactory<F, F{std::cosh}>::Create("cosh"), FolderFactory<F, F{std::exp}>::Create("exp"), FolderFactory<F, F{std::log}>::Create("log"), - FolderFactory<F2, F2{std::pow}>::Create("pow"), - FolderFactory<F2A, F2A{std::pow}>::Create("pow"), - FolderFactory<F2B, F2B{std::pow}>::Create("pow"), + FolderFactory<F2, F2{StdPowF2<HostT>}>::Create("pow"), + FolderFactory<F2A, F2A{StdPowF2A<HostT>}>::Create("pow"), + FolderFactory<F2B, F2B{StdPowF2B<HostT>}>::Create("pow"), FolderFactory<F, F{std::sin}>::Create("sin"), FolderFactory<F, F{std::sinh}>::Create("sinh"), FolderFactory<F, F{std::sqrt}>::Create("sqrt"), From 96ad495289d241fc8f445ebdf4a9c1a6f6ff408e Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 30 Aug 2024 14:38:59 +0100 Subject: [PATCH 29/98] [SLP] vectorizeChainsInBlock - remove superfluous continue at the end of for loop. NFC. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 345b01b82c6aa4..7b80b9ad7ce38d 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -19149,7 +19149,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } // Undefs come last. assert(U1 && U2 && "The only thing left should be undef & undef."); - continue; } return false; }; From b719c9255126aeba7a9455fd026471c45c988e2d Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 30 Aug 2024 14:40:32 +0100 Subject: [PATCH 30/98] [SLP] findBestRootPair - fix incorrect argument name comment. NFC. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 7b80b9ad7ce38d..4c0a1c4c094b95 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2605,7 +2605,7 @@ class BoUpSLP { int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first, Candidates[I].second, /*U1=*/nullptr, /*U2=*/nullptr, - /*Level=*/1, std::nullopt); + /*CurrLevel=*/1, std::nullopt); if (Score > BestScore) { BestScore = Score; Index = I; From 362d37aeab7e8ba5dc4125480de3d45cc6bb23dc Mon Sep 17 00:00:00 2001 From: MichelleCDjunaidi Date: Fri, 30 Aug 2024 21:55:13 +0800 Subject: [PATCH 31/98] Update clang-tidy Contributing guide (#106672) Update the documentation to direct new users to GitHub instead of the discontinued Phabricator archive.
Also details more ways and information regarding clang-query usage. Partially resolves/disclaims #106656 and #106663 as per discussion in https://discourse.llvm.org/t/inconsistency-between-hasdescendant-in-clang-query-and-clang-libtooling-matchers/80799/. Also updates the out-of-tree guide. For context, I recently went through the Contributing guide while writing https://github.com/llvm/llvm-project/pull/102299, and many of these updates were from my experience trying to follow the guide. e.g., I was trying to link the shared library of an out-of-tree check as SHARED in CMake and encountered duplicate symbols like _ZTIN5clang4tidy14ClangTidyCheckE. It wasn't until I saw https://github.com/llvm/llvm-project/commit/84f137a590e7de25c4105303e5938c40566c2dfb that I found out I had to use MODULE. I also encountered the clang-query difference, which was a surprise since the documentation said the two matchers were "virtually identical". Also, the -header-filter thing tripped me up until I found https://github.com/llvm/llvm-project/issues/25590 and https://github.com/llvm/llvm-project/pull/91400. Usually, when people say restrict and filter, they mean filter out (since -header-filter instead includes/filters in said headers). --- .../docs/clang-tidy/Contributing.rst | 64 +++++++++++++------ 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/clang-tools-extra/docs/clang-tidy/Contributing.rst b/clang-tools-extra/docs/clang-tidy/Contributing.rst index 92074bd4dae8ba..b04809c3308f17 100644 --- a/clang-tools-extra/docs/clang-tidy/Contributing.rst +++ b/clang-tools-extra/docs/clang-tidy/Contributing.rst @@ -127,14 +127,15 @@ Writing a clang-tidy Check So you have an idea of a useful check for :program:`clang-tidy`. -First, if you're not familiar with LLVM development, read through the `Getting -Started with LLVM`_ document for instructions on setting up your workflow and +First, if you're not familiar with LLVM development, read through the `Getting Started +with the LLVM System`_ document for instructions on setting up your workflow and the `LLVM Coding Standards`_ document to familiarize yourself with the coding -style used in the project. For code reviews we mostly use `LLVM Phabricator`_. +style used in the project. For code reviews we currently use `LLVM Github`_, +though historically we used Phabricator. -.. _Getting Started with LLVM: https://llvm.org/docs/GettingStarted.html +.. _Getting Started with the LLVM System: https://llvm.org/docs/GettingStarted.html .. _LLVM Coding Standards: https://llvm.org/docs/CodingStandards.html -.. _LLVM Phabricator: https://llvm.org/docs/Phabricator.html +.. _LLVM Github: https://github.com/llvm/llvm-project Next, you need to decide which module the check belongs to. Modules are located in subdirectories of `clang-tidy/ @@ -336,13 +337,24 @@ a starting point for your test cases. A rough outline of the process looks like The quickest way to prototype your matcher is to use :program:`clang-query` to interactively build up your matcher. For complicated matchers, build up a matching expression incrementally and use :program:`clang-query`'s ``let`` command to save named -matching expressions to simplify your matcher. Just like breaking up a huge function -into smaller chunks with intention-revealing names can help you understand a complex -algorithm, breaking up a matcher into smaller matchers with intention-revealing names -can help you understand a complicated matcher.
Once you have a working matcher, the -C++ API will be virtually identical to your interactively constructed matcher. You can -use local variables to preserve your intention-revealing names that you applied to -nested matchers. +matching expressions to simplify your matcher. + +.. code-block:: console + clang-query> let c1 cxxRecordDecl() + clang-query> match c1 + +Alternatively, pressing the tab key after a previous matcher's open parentheses would also +show which matchers can be chained with the previous matcher, though some matchers that work +may not be listed. + +Just like breaking up a huge function into smaller chunks with intention-revealing names +can help you understand a complex algorithm, breaking up a matcher into smaller matchers +with intention-revealing names can help you understand a complicated matcher. + +Once you have a working clang-query matcher, the C++ API matchers will be the same or similar +to your interactively constructed matcher (there can be cases where they differ slightly). +You can use local variables to preserve your intention-revealing names that you applied +to nested matchers. Creating private matchers ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -646,10 +658,13 @@ directory. The path to this directory is available in a lit test with the varia Out-of-tree check plugins ------------------------- + Developing an out-of-tree check as a plugin largely follows the steps -outlined above. The plugin is a shared library whose code lives outside +outlined above, including creating a new module and doing the hacks to +register the module. The plugin is a shared library whose code lives outside the clang-tidy build system. Build and link this shared library against -LLVM as done for other kinds of Clang plugins. +LLVM as done for other kinds of Clang plugins. If using CMake, use the keyword +``MODULE`` while invoking ``add_library`` or ``llvm_add_library``. The plugin can be loaded by passing `-load` to `clang-tidy` in addition to the names of the checks to enable. @@ -664,6 +679,19 @@ compiled against the version of clang-tidy that will be loading the plugin. The plugins can use threads, TLS, or any other facilities available to in-tree code which is accessible from the external headers. +Note that testing out-of-tree checks might involve getting ``llvm-lit`` from an LLVM +installation compiled from source. See `Getting Started with the LLVM System`_ for ways +to do so. + +Alternatively, get `lit`_ following the `test-suite guide`_ and get the `FileCheck`_ binary, +and write a version of `check_clang_tidy.py`_ to suit your needs. + +.. _Getting Started with the LLVM System: https://llvm.org/docs/GettingStarted.html +.. _test-suite guide: https://llvm.org/docs/TestSuiteGuide.html +.. _lit: https://llvm.org/docs/CommandGuide/lit.html +.. _FileCheck: https://llvm.org/docs/CommandGuide/FileCheck.html +.. _check_clang_tidy.py: https://github.com/llvm/llvm-project/blob/main/clang-tools-extra/test/clang-tidy/check_clang_tidy.py + Running clang-tidy on LLVM -------------------------- @@ -688,10 +716,10 @@ warnings and errors. The script provides multiple configuration flags. * To restrict the files examined you can provide one or more regex arguments that the file names are matched against. - ``run-clang-tidy.py clang-tidy/.*Check\.cpp`` will only analyze clang-tidy + ``run-clang-tidy.py clang-tidy/.*Check\.cpp`` will only analyze `clang-tidy` checks. It may also be necessary to restrict the header files that warnings - are displayed from using the ``-header-filter`` flag. 
It has the same behavior - as the corresponding :program:`clang-tidy` flag. + are displayed from by using the ``-header-filter`` and ``-exclude-header-filter`` flags. + They have the same behavior as the corresponding :program:`clang-tidy` flags. * To apply suggested fixes ``-fix`` can be passed as an argument. This gathers all changes in a temporary directory and applies them. Passing ``-format`` @@ -758,4 +786,4 @@ There is only one argument that controls profile storage: * If you run :program:`clang-tidy` from within ``/foo`` directory, and specify ``-store-check-profile=.``, then the profile will still be saved to - ``/foo/-example.cpp.json`` + ``/foo/-example.cpp.json`` \ No newline at end of file From 24977395592fb3a47d0356b6e9e6d25358a521c5 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Fri, 30 Aug 2024 15:03:50 +0100 Subject: [PATCH 32/98] [AArch64][AsmParser] Directives should clear transitively implied features (#106625) The commit ff3f3a54e2d1 made it possible to enable transitively implied features when parsing assembler directives. For example enabling sve2 also enables sve. This patch allows disabling features which depend on each other. For example disabling sve also disables sve2. --- .../AArch64/AsmParser/AArch64AsmParser.cpp | 102 ++++++++---------- .../MC/AArch64/SVE/directive-arch-negative.s | 8 ++ .../SVE/directive-arch_extension-negative.s | 7 +- .../MC/AArch64/SVE/directive-cpu-negative.s | 7 +- .../test/MC/AArch64/directive-arch-negative.s | 5 +- .../directive-arch_extension-negative.s | 14 ++- 6 files changed, 84 insertions(+), 59 deletions(-) create mode 100644 llvm/test/MC/AArch64/SVE/directive-arch-negative.s diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 37add682b150e7..34c0fad45fc499 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -6947,10 +6947,14 @@ static void ExpandCryptoAEK(const AArch64::ArchInfo &ArchInfo, } } +static SMLoc incrementLoc(SMLoc L, int Offset) { + return SMLoc::getFromPointer(L.getPointer() + Offset); +} + /// parseDirectiveArch /// ::= .arch token bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { - SMLoc ArchLoc = getLoc(); + SMLoc CurLoc = getLoc(); StringRef Arch, ExtensionString; std::tie(Arch, ExtensionString) = @@ -6958,7 +6962,7 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { const AArch64::ArchInfo *ArchInfo = AArch64::parseArch(Arch); if (!ArchInfo) - return Error(ArchLoc, "unknown arch name"); + return Error(CurLoc, "unknown arch name"); if (parseToken(AsmToken::EndOfStatement)) return true; @@ -6978,27 +6982,30 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { ExtensionString.split(RequestedExtensions, '+'); ExpandCryptoAEK(*ArchInfo, RequestedExtensions); + CurLoc = incrementLoc(CurLoc, Arch.size()); - FeatureBitset Features = STI.getFeatureBits(); - setAvailableFeatures(ComputeAvailableFeatures(Features)); for (auto Name : RequestedExtensions) { + // Advance source location past '+'. 
+ CurLoc = incrementLoc(CurLoc, 1); + bool EnableFeature = !Name.consume_front_insensitive("no"); - for (const auto &Extension : ExtensionMap) { - if (Extension.Name != Name) - continue; + auto It = llvm::find_if(ExtensionMap, [&Name](const auto &Extension) { + return Extension.Name == Name; + }); - if (Extension.Features.none()) - report_fatal_error("unsupported architectural extension: " + Name); + if (It == std::end(ExtensionMap)) + Error(CurLoc, "unsupported architectural extension: " + Name); - FeatureBitset ToggleFeatures = - EnableFeature - ? STI.SetFeatureBitsTransitively(~Features & Extension.Features) - : STI.ToggleFeature(Features & Extension.Features); - setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures)); - break; - } + if (EnableFeature) + STI.SetFeatureBitsTransitively(It->Features); + else + STI.ClearFeatureBitsTransitively(It->Features); + + CurLoc = incrementLoc(CurLoc, Name.size()); } + FeatureBitset Features = ComputeAvailableFeatures(STI.getFeatureBits()); + setAvailableFeatures(Features); return false; } @@ -7018,28 +7025,21 @@ bool AArch64AsmParser::parseDirectiveArchExtension(SMLoc L) { Name = Name.substr(2); } - MCSubtargetInfo &STI = copySTI(); - FeatureBitset Features = STI.getFeatureBits(); - for (const auto &Extension : ExtensionMap) { - if (Extension.Name != Name) - continue; - - if (Extension.Features.none()) - return Error(ExtLoc, "unsupported architectural extension: " + Name); - - FeatureBitset ToggleFeatures = - EnableFeature - ? STI.SetFeatureBitsTransitively(~Features & Extension.Features) - : STI.ToggleFeature(Features & Extension.Features); - setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures)); - return false; - } + auto It = llvm::find_if(ExtensionMap, [&Name](const auto &Extension) { + return Extension.Name == Name; + }); - return Error(ExtLoc, "unknown architectural extension: " + Name); -} + if (It == std::end(ExtensionMap)) + return Error(ExtLoc, "unsupported architectural extension: " + Name); -static SMLoc incrementLoc(SMLoc L, int Offset) { - return SMLoc::getFromPointer(L.getPointer() + Offset); + MCSubtargetInfo &STI = copySTI(); + if (EnableFeature) + STI.SetFeatureBitsTransitively(It->Features); + else + STI.ClearFeatureBitsTransitively(It->Features); + FeatureBitset Features = ComputeAvailableFeatures(STI.getFeatureBits()); + setAvailableFeatures(Features); + return false; } /// parseDirectiveCPU @@ -7075,30 +7075,22 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { bool EnableFeature = !Name.consume_front_insensitive("no"); - bool FoundExtension = false; - for (const auto &Extension : ExtensionMap) { - if (Extension.Name != Name) - continue; - - if (Extension.Features.none()) - report_fatal_error("unsupported architectural extension: " + Name); - - FeatureBitset Features = STI.getFeatureBits(); - FeatureBitset ToggleFeatures = - EnableFeature - ? 
STI.SetFeatureBitsTransitively(~Features & Extension.Features) - : STI.ToggleFeature(Features & Extension.Features); - setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures)); - FoundExtension = true; + auto It = llvm::find_if(ExtensionMap, [&Name](const auto &Extension) { + return Extension.Name == Name; + }); - break; - } + if (It == std::end(ExtensionMap)) + Error(CurLoc, "unsupported architectural extension: " + Name); - if (!FoundExtension) - Error(CurLoc, "unsupported architectural extension"); + if (EnableFeature) + STI.SetFeatureBitsTransitively(It->Features); + else + STI.ClearFeatureBitsTransitively(It->Features); CurLoc = incrementLoc(CurLoc, Name.size()); } + FeatureBitset Features = ComputeAvailableFeatures(STI.getFeatureBits()); + setAvailableFeatures(Features); return false; } diff --git a/llvm/test/MC/AArch64/SVE/directive-arch-negative.s b/llvm/test/MC/AArch64/SVE/directive-arch-negative.s new file mode 100644 index 00000000000000..e3029c16ffc8a6 --- /dev/null +++ b/llvm/test/MC/AArch64/SVE/directive-arch-negative.s @@ -0,0 +1,8 @@ +// RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s + +// Check that setting +nosve implies +nosve2 +.arch armv9-a+nosve + +adclb z0.s, z1.s, z31.s +// CHECK: error: instruction requires: sve2 +// CHECK-NEXT: adclb z0.s, z1.s, z31.s diff --git a/llvm/test/MC/AArch64/SVE/directive-arch_extension-negative.s b/llvm/test/MC/AArch64/SVE/directive-arch_extension-negative.s index 661f13974d0bc8..31118f7490d00d 100644 --- a/llvm/test/MC/AArch64/SVE/directive-arch_extension-negative.s +++ b/llvm/test/MC/AArch64/SVE/directive-arch_extension-negative.s @@ -1,7 +1,12 @@ // RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s -.arch_extension nosve +.arch_extension sve2+nosve ptrue p0.b, pow2 // CHECK: error: instruction requires: sve or sme // CHECK-NEXT: ptrue p0.b, pow2 + +// Check that setting +nosve implies +nosve2 +adclb z0.s, z1.s, z31.s +// CHECK: error: instruction requires: sve2 +// CHECK-NEXT: adclb z0.s, z1.s, z31.s diff --git a/llvm/test/MC/AArch64/SVE/directive-cpu-negative.s b/llvm/test/MC/AArch64/SVE/directive-cpu-negative.s index 82acc1b0b0be9b..6ba537ca70609e 100644 --- a/llvm/test/MC/AArch64/SVE/directive-cpu-negative.s +++ b/llvm/test/MC/AArch64/SVE/directive-cpu-negative.s @@ -1,6 +1,11 @@ // RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s -.cpu generic+sve+nosve +.cpu generic+sve2+nosve ptrue p0.b, pow2 // CHECK: error: instruction requires: sve or sme // CHECK-NEXT: ptrue p0.b, pow2 + +// Check that setting +nosve implies +nosve2 +adclb z0.s, z1.s, z31.s +// CHECK: error: instruction requires: sve2 +// CHECK-NEXT: adclb z0.s, z1.s, z31.s diff --git a/llvm/test/MC/AArch64/directive-arch-negative.s b/llvm/test/MC/AArch64/directive-arch-negative.s index f60759899aa6c9..406507d5fc8f4d 100644 --- a/llvm/test/MC/AArch64/directive-arch-negative.s +++ b/llvm/test/MC/AArch64/directive-arch-negative.s @@ -12,10 +12,13 @@ # CHECK-NEXT: aese v0.8h, v1.8h # CHECK-NEXT: ^ -// We silently ignore invalid features. 
.arch armv8+foo aese v0.8h, v1.8h +# CHECK: error: unsupported architectural extension: foo +# CHECK-NEXT: .arch armv8+foo +# CHECK-NEXT: ^ + # CHECK: error: invalid operand for instruction # CHECK-NEXT: aese v0.8h, v1.8h # CHECK-NEXT: ^ diff --git a/llvm/test/MC/AArch64/directive-arch_extension-negative.s b/llvm/test/MC/AArch64/directive-arch_extension-negative.s index 1c1cfc9d33e3ed..1843af56555461 100644 --- a/llvm/test/MC/AArch64/directive-arch_extension-negative.s +++ b/llvm/test/MC/AArch64/directive-arch_extension-negative.s @@ -4,7 +4,7 @@ // RUN: -filetype asm -o - %s 2>&1 | FileCheck %s .arch_extension axp64 -// CHECK: error: unknown architectural extension: axp64 +// CHECK: error: unsupported architectural extension: axp64 // CHECK-NEXT: .arch_extension axp64 crc32cx w0, w1, x3 @@ -49,6 +49,8 @@ fminnm d0, d0, d1 // CHECK: [[@LINE-1]]:1: error: instruction requires: fp // CHECK-NEXT: fminnm d0, d0, d1 +// nofp implied nosimd, so reinstate it +.arch_extension simd addp v0.4s, v0.4s, v0.4s // CHECK-NOT: [[@LINE-1]]:1: error: instruction requires: neon .arch_extension nosimd @@ -70,6 +72,8 @@ casa w5, w7, [x20] // CHECK: [[@LINE-1]]:1: error: instruction requires: lse // CHECK-NEXT: casa w5, w7, [x20] +// nolse implied nolse128, so reinstate it +.arch_extension lse128 swpp x0, x2, [x3] // CHECK-NOT: [[@LINE-1]]:1: error: instruction requires: lse128 .arch_extension nolse128 @@ -84,6 +88,8 @@ cfp rctx, x0 // CHECK: [[@LINE-1]]:5: error: CFPRCTX requires: predres // CHECK-NEXT: cfp rctx, x0 +// nopredres implied nopredres2, so reinstate it +.arch_extension predres2 cosp rctx, x0 // CHECK-NOT: [[@LINE-1]]:6: error: COSP requires: predres2 .arch_extension nopredres2 @@ -133,6 +139,8 @@ ldapr x0, [x1] // CHECK: [[@LINE-1]]:1: error: instruction requires: rcpc // CHECK-NEXT: ldapr x0, [x1] +// norcpc implied norcpc3, so reinstate it +.arch_extension rcpc3 stilp w24, w0, [x16, #-8]! // CHECK-NOT: [[@LINE-1]]:1: error: instruction requires: rcpc3 .arch_extension norcpc3 @@ -169,6 +177,8 @@ cpyfp [x0]!, [x1]!, x2! // CHECK: [[@LINE-1]]:1: error: instruction requires: mops // CHECK-NEXT: cpyfp [x0]!, [x1]!, x2! +// nolse128 implied nod128, so reinstate it +.arch_extension d128 // This needs to come before `.arch_extension nothe` as it uses an instruction // that requires both the and d128 sysp #0, c2, c0, #0, x0, x1 @@ -204,6 +214,8 @@ umax x0, x1, x2 // CHECK: [[@LINE-1]]:1: error: instruction requires: cssc // CHECK-NEXT: umax x0, x1, x2 +// noras implied norasv2, so reinstate it +.arch_extension rasv2 mrs x0, ERXGSR_EL1 // CHECK-NOT: [[@LINE-1]]:9: error: expected readable system register .arch_extension norasv2 From c792de28dfaf3a13703e83e4eb09dd44574b3a3e Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 30 Aug 2024 15:05:29 +0100 Subject: [PATCH 33/98] [libcxx][test] Add macro for when long double is just double (#106708) This removes the need for the long list of platforms in strong_order_long_double_verify. 
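For illustration, a minimal sketch of how a test can branch on such a macro rather than XFAIL-ing a list of triples; only the predefined-size check mirrors the patch, and the function below is hypothetical.

    #if __SIZEOF_LONG_DOUBLE__ == __SIZEOF_DOUBLE__
    #  define TEST_LONG_DOUBLE_IS_DOUBLE
    #endif

    void check(long double ld) {
    #ifdef TEST_LONG_DOUBLE_IS_DOUBLE
      (void)ld; // long double is just double here; nothing extra to verify
    #else
      // exercise the wider long double path on targets where it differs
    #endif
    }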
--- .../strong_order_long_double.verify.cpp | 19 ++++--------------- .../numerics/bit/bit.cast/bit_cast.pass.cpp | 2 +- libcxx/test/support/test_macros.h | 4 ++++ 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/libcxx/test/std/language.support/cmp/cmp.alg/strong_order_long_double.verify.cpp b/libcxx/test/std/language.support/cmp/cmp.alg/strong_order_long_double.verify.cpp index c9c2ba20021491..cd032d48648953 100644 --- a/libcxx/test/std/language.support/cmp/cmp.alg/strong_order_long_double.verify.cpp +++ b/libcxx/test/std/language.support/cmp/cmp.alg/strong_order_long_double.verify.cpp @@ -8,21 +8,6 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 -// The following platforms have sizeof(long double) == sizeof(double), so this test doesn't apply to them. -// This test does apply to aarch64 where Arm's AAPCS64 is followed. There they are different sizes. -// XFAIL: target={{arm64|arm64e|armv(7|8)(l|m)?|powerpc|powerpc64}}-{{.+}} - -// MSVC configurations have long double equal to regular double on all -// architectures. -// XFAIL: target={{.+}}-pc-windows-msvc - -// ARM/AArch64 MinGW also has got long double equal to regular double, just -// like MSVC (thus match both MinGW and MSVC here, for those architectures). -// XFAIL: target={{aarch64|armv7}}-{{.*}}-windows-{{.+}} - -// Android's 32-bit x86 target has long double equal to regular double. -// XFAIL: target=i686-{{.+}}-android{{.*}} - // // template constexpr strong_ordering strong_order(const T& a, const T& b); @@ -37,5 +22,9 @@ void f() { long double ld = 3.14; +#ifdef TEST_LONG_DOUBLE_IS_DOUBLE + (void)ld; // expected-no-diagnostics +#else (void)std::strong_order(ld, ld); // expected-error@*:* {{std::strong_order is unimplemented for this floating-point type}} +#endif } diff --git a/libcxx/test/std/numerics/bit/bit.cast/bit_cast.pass.cpp b/libcxx/test/std/numerics/bit/bit.cast/bit_cast.pass.cpp index f73877416a7170..044589298439c1 100644 --- a/libcxx/test/std/numerics/bit/bit.cast/bit_cast.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.cast/bit_cast.pass.cpp @@ -229,7 +229,7 @@ bool tests() { test_roundtrip_through_nested_T(i); test_roundtrip_through_buffer(i); -#if __SIZEOF_LONG_DOUBLE__ == __SIZEOF_DOUBLE__ +#ifdef TEST_LONG_DOUBLE_IS_DOUBLE test_roundtrip_through(i); #endif #if defined(__SIZEOF_INT128__) && __SIZEOF_LONG_DOUBLE__ == __SIZEOF_INT128__ && \ diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h index 6f7ec3aa0c1f9f..5d4c1a65cfafb2 100644 --- a/libcxx/test/support/test_macros.h +++ b/libcxx/test/support/test_macros.h @@ -511,4 +511,8 @@ inline Tp const& DoNotOptimize(Tp const& value) { # define TEST_CONSTEXPR_OPERATOR_NEW #endif +#if __SIZEOF_LONG_DOUBLE__ == __SIZEOF_DOUBLE__ +# define TEST_LONG_DOUBLE_IS_DOUBLE +#endif + #endif // SUPPORT_TEST_MACROS_HPP From f4ea19b47e1e5af6682d94ad05ac2e7bca64cf73 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Fri, 30 Aug 2024 16:13:47 +0200 Subject: [PATCH 34/98] [libc++][syncbuf] Implement LWG3253 (#99778) Closes #100264 --- libcxx/docs/Status/Cxx20Issues.csv | 2 +- libcxx/include/syncstream | 9 +++++++-- .../syncstream.syncbuf.cons/cons.default.pass.cpp | 7 +++++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv index 9c65ff9a536407..e5d2498473ecde 100644 --- a/libcxx/docs/Status/Cxx20Issues.csv +++ b/libcxx/docs/Status/Cxx20Issues.csv @@ -172,7 +172,7 @@ "`LWG3221 `__","Result of ``year_month``\ arithmetic with ``months``\ is ambiguous","2019-11 
(Belfast)","|Complete|","8.0","" "`LWG3235 `__","``parse``\ manipulator without abbreviation is not callable","2019-11 (Belfast)","","","" "`LWG3246 `__","LWG3246: What are the constraints on the template parameter of `basic_format_arg`?","2019-11 (Belfast)","|Nothing To Do|","","" -"`LWG3253 `__","``basic_syncbuf::basic_syncbuf()``\ should not be explicit","2019-11 (Belfast)","","","" +"`LWG3253 `__","``basic_syncbuf::basic_syncbuf()``\ should not be explicit","2019-11 (Belfast)","|Complete|","20.0","" "`LWG3245 `__","Unnecessary restriction on ``'%p'``\ parse specifier","2019-11 (Belfast)","","","" "`LWG3244 `__","Constraints for ``Source``\ in |sect|\ [fs.path.req] insufficiently constrainty","2019-11 (Belfast)","","","" "`LWG3241 `__","``chrono-spec``\ grammar ambiguity in |sect|\ [time.format]","2019-11 (Belfast)","|Complete|","16.0","" diff --git a/libcxx/include/syncstream b/libcxx/include/syncstream index e6f35b6f428eda..a0617f4acf5b6a 100644 --- a/libcxx/include/syncstream +++ b/libcxx/include/syncstream @@ -46,7 +46,9 @@ namespace std { using streambuf_type = basic_streambuf; // [syncstream.syncbuf.cons], construction and destruction - explicit basic_syncbuf(streambuf_type* obuf = nullptr) + basic_syncbuf() + : basic_syncbuf(nullptr) {} + explicit basic_syncbuf(streambuf_type* obuf) : basic_syncbuf(obuf, Allocator()) {} basic_syncbuf(streambuf_type*, const Allocator&); basic_syncbuf(basic_syncbuf&&); @@ -253,7 +255,10 @@ public: // [syncstream.syncbuf.cons], construction and destruction - _LIBCPP_HIDE_FROM_ABI explicit basic_syncbuf(streambuf_type* __obuf = nullptr) + _LIBCPP_HIDE_FROM_ABI basic_syncbuf() + : basic_syncbuf(nullptr) {} + + _LIBCPP_HIDE_FROM_ABI explicit basic_syncbuf(streambuf_type* __obuf) : basic_syncbuf(__obuf, _Allocator()) {} _LIBCPP_HIDE_FROM_ABI basic_syncbuf(streambuf_type* __obuf, _Allocator const& __alloc) diff --git a/libcxx/test/std/input.output/syncstream/syncbuf/syncstream.syncbuf.cons/cons.default.pass.cpp b/libcxx/test/std/input.output/syncstream/syncbuf/syncstream.syncbuf.cons/cons.default.pass.cpp index aa0eb2d41e0f01..beebc36c76758e 100644 --- a/libcxx/test/std/input.output/syncstream/syncbuf/syncstream.syncbuf.cons/cons.default.pass.cpp +++ b/libcxx/test/std/input.output/syncstream/syncbuf/syncstream.syncbuf.cons/cons.default.pass.cpp @@ -25,8 +25,15 @@ #include "constexpr_char_traits.h" #include "test_allocator.h" +template +std::basic_syncbuf lwg3253_default_constructor_is_not_explicit() { + return {}; +} + template void test() { + lwg3253_default_constructor_is_not_explicit(); + { using Buf = std::basic_syncbuf; static_assert(std::default_initializable); From ab40ae8ff9f87b6e3d68cab2c47d692016ede958 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 30 Aug 2024 07:18:55 -0700 Subject: [PATCH 35/98] [lldb] Store SupportFiles in SourceManager::File (NFC) (#106639) To support detecting MD5 checksum mismatches, store a SupportFile rather than a plain FileSpec in SourceManager::File. 
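As a rough sketch of the idea (simplified stand-ins, not the actual LLDB classes): pairing the path with an optional checksum is what lets a consumer notice that the file on disk no longer matches what the debug info described.

    #include <optional>
    #include <string>
    #include <utility>

    class SupportFileSketch {
    public:
      SupportFileSketch(std::string Path, std::optional<std::string> MD5 = {})
          : Path(std::move(Path)), MD5(std::move(MD5)) {}

      // A mismatch is reported only when both sides carry a checksum
      // and the digests disagree.
      bool ChecksumMismatch(const std::optional<std::string> &OnDisk) const {
        return MD5 && OnDisk && *MD5 != *OnDisk;
      }

    private:
      std::string Path;
      std::optional<std::string> MD5; // unset: no checksum recorded
    };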
--- lldb/include/lldb/Core/SourceManager.h | 22 +-- lldb/source/Commands/CommandObjectSource.cpp | 4 +- lldb/source/Core/IOHandlerCursesGUI.cpp | 14 +- lldb/source/Core/SourceManager.cpp | 139 +++++++++++-------- lldb/unittests/Core/SourceManagerTest.cpp | 12 +- 5 files changed, 108 insertions(+), 83 deletions(-) diff --git a/lldb/include/lldb/Core/SourceManager.h b/lldb/include/lldb/Core/SourceManager.h index 5239ac6f4055f5..8feeb4347dd52e 100644 --- a/lldb/include/lldb/Core/SourceManager.h +++ b/lldb/include/lldb/Core/SourceManager.h @@ -37,8 +37,8 @@ class SourceManager { const SourceManager::File &rhs); public: - File(const FileSpec &file_spec, lldb::TargetSP target_sp); - File(const FileSpec &file_spec, lldb::DebuggerSP debugger_sp); + File(lldb::SupportFileSP support_file_sp, lldb::TargetSP target_sp); + File(lldb::SupportFileSP support_file_sp, lldb::DebuggerSP debugger_sp); bool ModificationTimeIsStale() const; bool PathRemappingIsStale() const; @@ -56,7 +56,10 @@ class SourceManager { bool LineIsValid(uint32_t line); - const FileSpec &GetFileSpec() { return m_file_spec; } + lldb::SupportFileSP GetSupportFile() const { + assert(m_support_file_sp && "SupportFileSP must always be valid"); + return m_support_file_sp; + } uint32_t GetSourceMapModificationID() const { return m_source_map_mod_id; } @@ -70,15 +73,13 @@ class SourceManager { protected: /// Set file and update modification time. - void SetFileSpec(FileSpec file_spec); + void SetSupportFile(lldb::SupportFileSP support_file_sp); bool CalculateLineOffsets(uint32_t line = UINT32_MAX); - FileSpec m_file_spec_orig; // The original file spec that was used (can be - // different from m_file_spec) - FileSpec m_file_spec; // The actually file spec being used (if the target - // has source mappings, this might be different from - // m_file_spec_orig) + /// The support file. If the target has source mappings, this might be + /// different from the original support file passed to the constructor. 
+ lldb::SupportFileSP m_support_file_sp; // Keep the modification time that this file data is valid for llvm::sys::TimePoint<> m_mod_time; @@ -93,7 +94,8 @@ class SourceManager { lldb::TargetWP m_target_wp; private: - void CommonInitializer(const FileSpec &file_spec, lldb::TargetSP target_sp); + void CommonInitializer(lldb::SupportFileSP support_file_sp, + lldb::TargetSP target_sp); }; typedef std::shared_ptr FileSP; diff --git a/lldb/source/Commands/CommandObjectSource.cpp b/lldb/source/Commands/CommandObjectSource.cpp index 5ddd46ac5fdc07..1a0629c6765d41 100644 --- a/lldb/source/Commands/CommandObjectSource.cpp +++ b/lldb/source/Commands/CommandObjectSource.cpp @@ -1076,8 +1076,8 @@ class CommandObjectSourceList : public CommandObjectParsed { target.GetSourceManager().GetLastFile()); if (last_file_sp) { const bool show_inlines = true; - m_breakpoint_locations.Reset(last_file_sp->GetFileSpec(), 0, - show_inlines); + m_breakpoint_locations.Reset( + last_file_sp->GetSupportFile()->GetSpecOnly(), 0, show_inlines); SearchFilterForUnconstrainedSearches target_search_filter( target.shared_from_this()); target_search_filter.Search(m_breakpoint_locations); diff --git a/lldb/source/Core/IOHandlerCursesGUI.cpp b/lldb/source/Core/IOHandlerCursesGUI.cpp index d922d32f910583..8f44e3d0cd016b 100644 --- a/lldb/source/Core/IOHandlerCursesGUI.cpp +++ b/lldb/source/Core/IOHandlerCursesGUI.cpp @@ -6894,8 +6894,8 @@ class SourceFileWindowDelegate : public WindowDelegate { if (context_changed) m_selected_line = m_pc_line; - if (m_file_sp && - m_file_sp->GetFileSpec() == m_sc.line_entry.GetFile()) { + if (m_file_sp && m_file_sp->GetSupportFile()->GetSpecOnly() == + m_sc.line_entry.GetFile()) { // Same file, nothing to do, we should either have the lines or // not (source file missing) if (m_selected_line >= static_cast(m_first_visible_line)) { @@ -7001,7 +7001,8 @@ class SourceFileWindowDelegate : public WindowDelegate { LineEntry bp_loc_line_entry; if (bp_loc_sp->GetAddress().CalculateSymbolContextLineEntry( bp_loc_line_entry)) { - if (m_file_sp->GetFileSpec() == bp_loc_line_entry.GetFile()) { + if (m_file_sp->GetSupportFile()->GetSpecOnly() == + bp_loc_line_entry.GetFile()) { bp_lines.insert(bp_loc_line_entry.line); } } @@ -7332,7 +7333,7 @@ class SourceFileWindowDelegate : public WindowDelegate { if (exe_ctx.HasProcessScope() && exe_ctx.GetProcessRef().IsAlive()) { BreakpointSP bp_sp = exe_ctx.GetTargetRef().CreateBreakpoint( nullptr, // Don't limit the breakpoint to certain modules - m_file_sp->GetFileSpec(), // Source file + m_file_sp->GetSupportFile()->GetSpecOnly(), // Source file m_selected_line + 1, // Source line number (m_selected_line is zero based) 0, // Unspecified column. @@ -7478,7 +7479,8 @@ class SourceFileWindowDelegate : public WindowDelegate { LineEntry bp_loc_line_entry; if (bp_loc_sp->GetAddress().CalculateSymbolContextLineEntry( bp_loc_line_entry)) { - if (m_file_sp->GetFileSpec() == bp_loc_line_entry.GetFile() && + if (m_file_sp->GetSupportFile()->GetSpecOnly() == + bp_loc_line_entry.GetFile() && m_selected_line + 1 == bp_loc_line_entry.line) { bool removed = exe_ctx.GetTargetRef().RemoveBreakpointByID(bp_sp->GetID()); @@ -7492,7 +7494,7 @@ class SourceFileWindowDelegate : public WindowDelegate { // No breakpoint found on the location, add it. 
BreakpointSP bp_sp = exe_ctx.GetTargetRef().CreateBreakpoint( nullptr, // Don't limit the breakpoint to certain modules - m_file_sp->GetFileSpec(), // Source file + m_file_sp->GetSupportFile()->GetSpecOnly(), // Source file m_selected_line + 1, // Source line number (m_selected_line is zero based) 0, // No column specified. diff --git a/lldb/source/Core/SourceManager.cpp b/lldb/source/Core/SourceManager.cpp index 0d70c554e5342b..cd0011a25f1c39 100644 --- a/lldb/source/Core/SourceManager.cpp +++ b/lldb/source/Core/SourceManager.cpp @@ -87,8 +87,10 @@ SourceManager::FileSP SourceManager::GetFile(const FileSpec &file_spec) { LLDB_LOG(log, "Source file caching disabled: creating new source file: {0}", file_spec); if (target_sp) - return std::make_shared(file_spec, target_sp); - return std::make_shared(file_spec, debugger_sp); + return std::make_shared(std::make_shared(file_spec), + target_sp); + return std::make_shared(std::make_shared(file_spec), + debugger_sp); } ProcessSP process_sp = target_sp ? target_sp->GetProcessSP() : ProcessSP(); @@ -136,7 +138,8 @@ SourceManager::FileSP SourceManager::GetFile(const FileSpec &file_spec) { } // Check if the file exists on disk. - if (file_sp && !FileSystem::Instance().Exists(file_sp->GetFileSpec())) { + if (file_sp && !FileSystem::Instance().Exists( + file_sp->GetSupportFile()->GetSpecOnly())) { LLDB_LOG(log, "File doesn't exist on disk: {0}", file_spec); file_sp.reset(); } @@ -148,9 +151,11 @@ SourceManager::FileSP SourceManager::GetFile(const FileSpec &file_spec) { // (Re)create the file. if (target_sp) - file_sp = std::make_shared(file_spec, target_sp); + file_sp = std::make_shared(std::make_shared(file_spec), + target_sp); else - file_sp = std::make_shared(file_spec, debugger_sp); + file_sp = std::make_shared(std::make_shared(file_spec), + debugger_sp); // Add the file to the debugger and process cache. If the file was // invalidated, this will overwrite it. @@ -444,25 +449,25 @@ void SourceManager::FindLinesMatchingRegex(FileSpec &file_spec, match_lines); } -SourceManager::File::File(const FileSpec &file_spec, +SourceManager::File::File(SupportFileSP support_file_sp, lldb::DebuggerSP debugger_sp) - : m_file_spec_orig(file_spec), m_file_spec(), m_mod_time(), + : m_support_file_sp(std::make_shared()), m_mod_time(), m_debugger_wp(debugger_sp), m_target_wp(TargetSP()) { - CommonInitializer(file_spec, {}); + CommonInitializer(support_file_sp, {}); } -SourceManager::File::File(const FileSpec &file_spec, TargetSP target_sp) - : m_file_spec_orig(file_spec), m_file_spec(), m_mod_time(), +SourceManager::File::File(SupportFileSP support_file_sp, TargetSP target_sp) + : m_support_file_sp(std::make_shared()), m_mod_time(), m_debugger_wp(target_sp ? target_sp->GetDebugger().shared_from_this() : DebuggerSP()), m_target_wp(target_sp) { - CommonInitializer(file_spec, target_sp); + CommonInitializer(support_file_sp, target_sp); } -void SourceManager::File::CommonInitializer(const FileSpec &file_spec, +void SourceManager::File::CommonInitializer(SupportFileSP support_file_sp, TargetSP target_sp) { // Set the file and update the modification time. - SetFileSpec(file_spec); + SetSupportFile(support_file_sp); // Always update the source map modification ID if we have a target. if (target_sp) @@ -472,65 +477,76 @@ void SourceManager::File::CommonInitializer(const FileSpec &file_spec, if (m_mod_time == llvm::sys::TimePoint<>()) { if (target_sp) { // If this is just a file name, try finding it in the target. 
- if (!file_spec.GetDirectory() && file_spec.GetFilename()) { - bool check_inlines = false; - SymbolContextList sc_list; - size_t num_matches = - target_sp->GetImages().ResolveSymbolContextForFilePath( - file_spec.GetFilename().AsCString(), 0, check_inlines, - SymbolContextItem(eSymbolContextModule | - eSymbolContextCompUnit), - sc_list); - bool got_multiple = false; - if (num_matches != 0) { - if (num_matches > 1) { - CompileUnit *test_cu = nullptr; - for (const SymbolContext &sc : sc_list) { - if (sc.comp_unit) { - if (test_cu) { - if (test_cu != sc.comp_unit) - got_multiple = true; - break; - } else - test_cu = sc.comp_unit; + { + FileSpec file_spec = support_file_sp->GetSpecOnly(); + if (!file_spec.GetDirectory() && file_spec.GetFilename()) { + bool check_inlines = false; + SymbolContextList sc_list; + size_t num_matches = + target_sp->GetImages().ResolveSymbolContextForFilePath( + file_spec.GetFilename().AsCString(), 0, check_inlines, + SymbolContextItem(eSymbolContextModule | + eSymbolContextCompUnit), + sc_list); + bool got_multiple = false; + if (num_matches != 0) { + if (num_matches > 1) { + CompileUnit *test_cu = nullptr; + for (const SymbolContext &sc : sc_list) { + if (sc.comp_unit) { + if (test_cu) { + if (test_cu != sc.comp_unit) + got_multiple = true; + break; + } else + test_cu = sc.comp_unit; + } } } - } - if (!got_multiple) { - SymbolContext sc; - sc_list.GetContextAtIndex(0, sc); - if (sc.comp_unit) - SetFileSpec(sc.comp_unit->GetPrimaryFile()); + if (!got_multiple) { + SymbolContext sc; + sc_list.GetContextAtIndex(0, sc); + if (sc.comp_unit) + SetSupportFile(std::make_shared( + sc.comp_unit->GetPrimaryFile())); + } } } } // Try remapping the file if it doesn't exist. - if (!FileSystem::Instance().Exists(m_file_spec)) { - // Check target specific source remappings (i.e., the - // target.source-map setting), then fall back to the module - // specific remapping (i.e., the .dSYM remapping dictionary). - auto remapped = target_sp->GetSourcePathMap().FindFile(m_file_spec); - if (!remapped) { - FileSpec new_spec; - if (target_sp->GetImages().FindSourceFile(m_file_spec, new_spec)) - remapped = new_spec; + { + FileSpec file_spec = support_file_sp->GetSpecOnly(); + if (!FileSystem::Instance().Exists(file_spec)) { + // Check target specific source remappings (i.e., the + // target.source-map setting), then fall back to the module + // specific remapping (i.e., the .dSYM remapping dictionary). + auto remapped = target_sp->GetSourcePathMap().FindFile(file_spec); + if (!remapped) { + FileSpec new_spec; + if (target_sp->GetImages().FindSourceFile(file_spec, new_spec)) + remapped = new_spec; + } + if (remapped) + SetSupportFile(std::make_shared( + *remapped, support_file_sp->GetChecksum())); } - if (remapped) - SetFileSpec(*remapped); } } } // If the file exists, read in the data. 
if (m_mod_time != llvm::sys::TimePoint<>()) - m_data_sp = FileSystem::Instance().CreateDataBuffer(m_file_spec); + m_data_sp = FileSystem::Instance().CreateDataBuffer( + m_support_file_sp->GetSpecOnly()); } -void SourceManager::File::SetFileSpec(FileSpec file_spec) { +void SourceManager::File::SetSupportFile(lldb::SupportFileSP support_file_sp) { + FileSpec file_spec = support_file_sp->GetSpecOnly(); resolve_tilde(file_spec); - m_file_spec = std::move(file_spec); - m_mod_time = FileSystem::Instance().GetModificationTime(m_file_spec); + m_support_file_sp = + std::make_shared(file_spec, support_file_sp->GetChecksum()); + m_mod_time = FileSystem::Instance().GetModificationTime(file_spec); } uint32_t SourceManager::File::GetLineOffset(uint32_t line) { @@ -603,7 +619,8 @@ bool SourceManager::File::ModificationTimeIsStale() const { // TODO: use host API to sign up for file modifications to anything in our // source cache and only update when we determine a file has been updated. // For now we check each time we want to display info for the file. - auto curr_mod_time = FileSystem::Instance().GetModificationTime(m_file_spec); + auto curr_mod_time = FileSystem::Instance().GetModificationTime( + m_support_file_sp->GetSpecOnly()); return curr_mod_time != llvm::sys::TimePoint<>() && m_mod_time != curr_mod_time; } @@ -644,7 +661,8 @@ size_t SourceManager::File::DisplaySourceLines(uint32_t line, debugger_sp->GetStopShowColumnAnsiSuffix()); HighlighterManager mgr; - std::string path = GetFileSpec().GetPath(/*denormalize*/ false); + std::string path = + GetSupportFile()->GetSpecOnly().GetPath(/*denormalize*/ false); // FIXME: Find a way to get the definitive language this file was written in // and pass it to the highlighter. const auto &h = mgr.getHighlighterFor(lldb::eLanguageTypeUnknown, path); @@ -698,7 +716,8 @@ void SourceManager::File::FindLinesMatchingRegex( bool lldb_private::operator==(const SourceManager::File &lhs, const SourceManager::File &rhs) { - if (lhs.m_file_spec != rhs.m_file_spec) + if (!lhs.GetSupportFile()->Equal(*rhs.GetSupportFile(), + SupportFile::eEqualChecksumIfSet)) return false; return lhs.m_mod_time == rhs.m_mod_time; } @@ -778,9 +797,9 @@ void SourceManager::SourceFileCache::AddSourceFile(const FileSpec &file_spec, assert(file_sp && "invalid FileSP"); AddSourceFileImpl(file_spec, file_sp); - const FileSpec &resolved_file_spec = file_sp->GetFileSpec(); + const FileSpec &resolved_file_spec = file_sp->GetSupportFile()->GetSpecOnly(); if (file_spec != resolved_file_spec) - AddSourceFileImpl(file_sp->GetFileSpec(), file_sp); + AddSourceFileImpl(file_sp->GetSupportFile()->GetSpecOnly(), file_sp); } void SourceManager::SourceFileCache::RemoveSourceFile(const FileSP &file_sp) { diff --git a/lldb/unittests/Core/SourceManagerTest.cpp b/lldb/unittests/Core/SourceManagerTest.cpp index 58d6f6cb3f8503..26ab0edffb398d 100644 --- a/lldb/unittests/Core/SourceManagerTest.cpp +++ b/lldb/unittests/Core/SourceManagerTest.cpp @@ -8,6 +8,7 @@ #include "lldb/Core/SourceManager.h" #include "lldb/Host/FileSystem.h" +#include "lldb/Utility/SupportFile.h" #include "gtest/gtest.h" #include "TestingSupport/MockTildeExpressionResolver.h" @@ -29,8 +30,8 @@ TEST_F(SourceFileCache, FindSourceFileFound) { // Insert: foo FileSpec foo_file_spec("foo"); - auto foo_file_sp = - std::make_shared(foo_file_spec, lldb::DebuggerSP()); + auto foo_file_sp = std::make_shared( + std::make_shared(foo_file_spec), lldb::DebuggerSP()); cache.AddSourceFile(foo_file_spec, foo_file_sp); // Query: foo, expect found. 
@@ -43,8 +44,8 @@ TEST_F(SourceFileCache, FindSourceFileNotFound) { // Insert: foo FileSpec foo_file_spec("foo"); - auto foo_file_sp = - std::make_shared<SourceManager::File>(foo_file_spec, lldb::DebuggerSP()); + auto foo_file_sp = std::make_shared<SourceManager::File>( + std::make_shared<SupportFile>(foo_file_spec), lldb::DebuggerSP()); cache.AddSourceFile(foo_file_spec, foo_file_sp); // Query: bar, expect not found. @@ -63,7 +64,8 @@ TEST_F(SourceFileCache, FindSourceFileByUnresolvedPath) { // Create the file with the resolved file spec. auto foo_file_sp = std::make_shared<SourceManager::File>( - resolved_foo_file_spec, lldb::DebuggerSP()); + std::make_shared<SupportFile>(resolved_foo_file_spec), + lldb::DebuggerSP()); // Cache the result with the unresolved file spec. cache.AddSourceFile(foo_file_spec, foo_file_sp); From b0eefb4c4e5136fd606cf4cff566df9dbc0fa051 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 29 Aug 2024 15:32:17 -0700 Subject: [PATCH 36/98] [lldb] Update SupportFile documentation (NFC) --- lldb/include/lldb/Utility/SupportFile.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lldb/include/lldb/Utility/SupportFile.h b/lldb/include/lldb/Utility/SupportFile.h index 334a0aaac2c27e..6a091bb84ada35 100644 --- a/lldb/include/lldb/Utility/SupportFile.h +++ b/lldb/include/lldb/Utility/SupportFile.h @@ -14,10 +14,10 @@ namespace lldb_private { -/// Wraps either a FileSpec that represents a local file or a source -/// file whose contents is known (for example because it can be -/// reconstructed from debug info), but that hasn't been written to a -/// file yet. This also stores an optional checksum of the on-disk content. +/// Wraps a FileSpec and an optional Checksum. The FileSpec represents either a +/// path to a file or a source file whose contents is known (for example because +/// it can be reconstructed from debug info), but that hasn't been written to a +/// file yet. class SupportFile { public: SupportFile() : m_file_spec(), m_checksum() {} From 0c4cf79defe30d43279bf4526cdf32b6c7f8a197 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= Date: Thu, 29 Aug 2024 20:57:25 +0200 Subject: [PATCH 37/98] [clang] Install scan-build-py into plain "lib" directory (#106612) Install scan-build-py modules into the plain `lib` directory, without LLVM_LIBDIR_SUFFIX appended, to match the path expected by the `intercept-build` executable. This fixes the program being unable to find its modules. Using an unsuffixed path makes sense here, since Python modules are not subject to multilib. This change effectively reverts 1334e129a39cb427e7b855e9a711a3e7604e50e5. The commit in question changed the path without a clear justification ("does not respect the given prefix") and the Python code was never modified to actually work with the change.
Fixes #106608 --- clang/tools/scan-build-py/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/tools/scan-build-py/CMakeLists.txt b/clang/tools/scan-build-py/CMakeLists.txt index 3aca22c0b0a8d3..9273eb5ed977e4 100644 --- a/clang/tools/scan-build-py/CMakeLists.txt +++ b/clang/tools/scan-build-py/CMakeLists.txt @@ -88,7 +88,7 @@ foreach(lib ${LibScanbuild}) DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/lib/libscanbuild/${lib}) list(APPEND Depends ${CMAKE_BINARY_DIR}/lib/libscanbuild/${lib}) install(FILES lib/libscanbuild/${lib} - DESTINATION lib${CLANG_LIBDIR_SUFFIX}/libscanbuild + DESTINATION lib/libscanbuild COMPONENT scan-build-py) endforeach() @@ -106,7 +106,7 @@ foreach(resource ${LibScanbuildResources}) DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/lib/libscanbuild/resources/${resource}) list(APPEND Depends ${CMAKE_BINARY_DIR}/lib/libscanbuild/resources/${resource}) install(FILES lib/libscanbuild/resources/${resource} - DESTINATION lib${CLANG_LIBDIR_SUFFIX}/libscanbuild/resources + DESTINATION lib/libscanbuild/resources COMPONENT scan-build-py) endforeach() @@ -122,7 +122,7 @@ foreach(lib ${LibEar}) DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/lib/libear/${lib}) list(APPEND Depends ${CMAKE_BINARY_DIR}/lib/libear/${lib}) install(FILES lib/libear/${lib} - DESTINATION lib${CLANG_LIBDIR_SUFFIX}/libear + DESTINATION lib/libear COMPONENT scan-build-py) endforeach() From 369d8148e09c2b91174ec01e845bc504cf622c45 Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Fri, 30 Aug 2024 07:34:49 -0700 Subject: [PATCH 38/98] [ValueTracking] use KnownBits to compute fpclass from bitcast (#97762) When we encounter a bitcast from an integer type we can use the information from `KnownBits` to glean some information about the fpclass: - If the sign bit is known, we can transfer this information over. - If the float is IEEE format and enough of the bits are known, we may be able to prove or rule out some fpclasses such as NaN, Zero, or Inf. 
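A minimal sketch of the bit-level reasoning for IEEE binary32 (plain C++, not the ValueTracking implementation; Known is a stand-in for KnownBits):

    #include <cstdint>

    struct Known {
      uint32_t Zero; // bits known to be 0
      uint32_t One;  // bits known to be 1
    };

    // IEEE binary32 layout: sign = bit 31, exponent = bits 30..23.
    constexpr uint32_t SignMask = 0x80000000u;
    constexpr uint32_t ExpMask = 0x7f800000u;

    // Any exponent bit known to be zero means the exponent cannot be
    // all-ones, which rules out NaN and infinity.
    bool cannotBeNanOrInf(Known K) { return (K.Zero & ExpMask) != 0; }

    // Any exponent bit known to be one means the exponent is nonzero,
    // which rules out zeroes and subnormals.
    bool cannotBeZeroOrSubnormal(Known K) { return (K.One & ExpMask) != 0; }

    // A known sign bit transfers directly to the floating-point class.
    bool knownNegative(Known K) { return (K.One & SignMask) != 0; }
    bool knownNonNegative(Known K) { return (K.Zero & SignMask) != 0; }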
--- clang/test/Headers/__clang_hip_math.hip | 571 +++++++++++++------ llvm/lib/Analysis/ValueTracking.cpp | 55 ++ llvm/test/Transforms/Attributor/nofpclass.ll | 280 +++++++++ 3 files changed, 719 insertions(+), 187 deletions(-) diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip index 6ee10976f12079..9d202e0d046822 100644 --- a/clang/test/Headers/__clang_hip_math.hip +++ b/clang/test/Headers/__clang_hip_math.hip @@ -2361,198 +2361,395 @@ extern "C" __device__ double test_modf(double x, double* y) { return modf(x, y); } -// CHECK-LABEL: @test_nanf( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG:%.*]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 -// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] -// CHECK: if.then.i.i: -// CHECK-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 -// CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ -// CHECK-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] -// CHECK-NEXT: i8 88, label [[WHILE_COND_I30_I_I_PREHEADER]] -// CHECK-NEXT: ] -// CHECK: while.cond.i30.i.i.preheader: -// CHECK-NEXT: br label [[WHILE_COND_I30_I_I:%.*]] -// CHECK: while.cond.i30.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_0_I31_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I37_I_I:%.*]], [[CLEANUP_I36_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[WHILE_COND_I30_I_I_PREHEADER]] ] -// CHECK-NEXT: [[__R_0_I32_I_I:%.*]] = phi i64 [ [[__R_2_I_I_I:%.*]], [[CLEANUP_I36_I_I]] ], [ 0, [[WHILE_COND_I30_I_I_PREHEADER]] ] -// CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I31_I_I]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: [[CMP_NOT_I33_I_I:%.*]] = icmp eq i8 [[TMP2]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I33_I_I]], label [[_ZL4NANFPKC_EXIT:%.*]], label [[WHILE_BODY_I34_I_I:%.*]] -// CHECK: while.body.i34.i.i: -// CHECK-NEXT: [[TMP3:%.*]] = add i8 [[TMP2]], -48 -// CHECK-NEXT: [[OR_COND_I35_I_I:%.*]] = icmp ult i8 [[TMP3]], 10 -// CHECK-NEXT: br i1 [[OR_COND_I35_I_I]], label [[IF_END31_I_I_I:%.*]], label [[IF_ELSE_I_I_I:%.*]] -// CHECK: if.else.i.i.i: -// CHECK-NEXT: [[TMP4:%.*]] = add i8 [[TMP2]], -97 -// CHECK-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP4]], 6 -// CHECK-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] -// CHECK: if.else17.i.i.i: -// CHECK-NEXT: [[TMP5:%.*]] = add i8 [[TMP2]], -65 -// CHECK-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP5]], 6 -// CHECK-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[CLEANUP_I36_I_I]] -// CHECK: if.end31.i.i.i: -// CHECK-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I34_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] -// CHECK-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I32_I_I]], 4 -// CHECK-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 -// CHECK-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] -// CHECK-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 -// CHECK-NEXT: br label [[CLEANUP_I36_I_I]] -// CHECK: cleanup.i36.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] -// CHECK-NEXT: [[__R_2_I_I_I]] = phi i64 [ [[ADD28_I_I_I]], 
[[IF_END31_I_I_I]] ], [ [[__R_0_I32_I_I]], [[IF_ELSE17_I_I_I]] ] -// CHECK-NEXT: [[COND_I_I_I:%.*]] = phi i1 [ true, [[IF_END31_I_I_I]] ], [ false, [[IF_ELSE17_I_I_I]] ] -// CHECK-NEXT: br i1 [[COND_I_I_I]], label [[WHILE_COND_I30_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP11]] -// CHECK: while.cond.i.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_0_I_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I_I_I:%.*]], [[CLEANUP_I_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN_I_I]] ] -// CHECK-NEXT: [[__R_0_I_I_I:%.*]] = phi i64 [ [[__R_1_I_I_I:%.*]], [[CLEANUP_I_I_I]] ], [ 0, [[IF_THEN_I_I]] ] -// CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I_I_I:%.*]] -// CHECK: while.body.i.i.i: -// CHECK-NEXT: [[TMP7:%.*]] = and i8 [[TMP6]], -8 -// CHECK-NEXT: [[OR_COND_I_I_I:%.*]] = icmp eq i8 [[TMP7]], 48 -// CHECK-NEXT: br i1 [[OR_COND_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[CLEANUP_I_I_I]] -// CHECK: if.then.i.i.i: -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I]], 3 -// CHECK-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 -// CHECK-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 -// CHECK-NEXT: br label [[CLEANUP_I_I_I]] -// CHECK: cleanup.i.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] -// CHECK-NEXT: [[__R_1_I_I_I]] = phi i64 [ [[SUB_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] -// CHECK-NEXT: br i1 [[OR_COND_I_I_I]], label [[WHILE_COND_I_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP7]] -// CHECK: while.cond.i14.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_0_I15_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I21_I_I:%.*]], [[CLEANUP_I20_I_I:%.*]] ], [ [[TAG]], [[ENTRY:%.*]] ] -// CHECK-NEXT: [[__R_0_I16_I_I:%.*]] = phi i64 [ [[__R_1_I22_I_I:%.*]], [[CLEANUP_I20_I_I]] ], [ 0, [[ENTRY]] ] -// CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I15_I_I]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP8]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I18_I_I:%.*]] -// CHECK: while.body.i18.i.i: -// CHECK-NEXT: [[TMP9:%.*]] = add i8 [[TMP8]], -48 -// CHECK-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp ult i8 [[TMP9]], 10 -// CHECK-NEXT: br i1 [[OR_COND_I19_I_I]], label [[IF_THEN_I24_I_I:%.*]], label [[CLEANUP_I20_I_I]] -// CHECK: if.then.i24.i.i: -// CHECK-NEXT: [[MUL_I25_I_I:%.*]] = mul i64 [[__R_0_I16_I_I]], 10 -// CHECK-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 -// CHECK-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 -// CHECK-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 -// CHECK-NEXT: br label [[CLEANUP_I20_I_I]] -// CHECK: cleanup.i20.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ [[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] -// CHECK-NEXT: [[__R_1_I22_I_I]] = phi i64 [ [[SUB_I28_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_BODY_I18_I_I]] ] -// CHECK-NEXT: br i1 [[OR_COND_I19_I_I]], 
label [[WHILE_COND_I14_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP10]] -// CHECK: _ZL4nanfPKc.exit: -// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[CLEANUP_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_COND_I_I_I]] ], [ 0, [[CLEANUP_I36_I_I]] ], [ [[__R_0_I32_I_I]], [[WHILE_COND_I30_I_I]] ], [ 0, [[CLEANUP_I20_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_COND_I14_I_I]] ] -// CHECK-NEXT: [[CONV_I:%.*]] = trunc i64 [[RETVAL_0_I_I]] to i32 -// CHECK-NEXT: [[BF_VALUE_I:%.*]] = and i32 [[CONV_I]], 4194303 -// CHECK-NEXT: [[BF_SET9_I:%.*]] = or disjoint i32 [[BF_VALUE_I]], 2143289344 -// CHECK-NEXT: [[TMP10:%.*]] = bitcast i32 [[BF_SET9_I]] to float -// CHECK-NEXT: ret float [[TMP10]] +// DEFAULT-LABEL: @test_nanf( +// DEFAULT-NEXT: entry: +// DEFAULT-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG:%.*]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 +// DEFAULT-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] +// DEFAULT: if.then.i.i: +// DEFAULT-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 +// DEFAULT-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ +// DEFAULT-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] +// DEFAULT-NEXT: i8 88, label [[WHILE_COND_I30_I_I_PREHEADER]] +// DEFAULT-NEXT: ] +// DEFAULT: while.cond.i30.i.i.preheader: +// DEFAULT-NEXT: br label [[WHILE_COND_I30_I_I:%.*]] +// DEFAULT: while.cond.i30.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_0_I31_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I37_I_I:%.*]], [[CLEANUP_I36_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[WHILE_COND_I30_I_I_PREHEADER]] ] +// DEFAULT-NEXT: [[__R_0_I32_I_I:%.*]] = phi i64 [ [[__R_2_I_I_I:%.*]], [[CLEANUP_I36_I_I]] ], [ 0, [[WHILE_COND_I30_I_I_PREHEADER]] ] +// DEFAULT-NEXT: [[TMP2:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I31_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[CMP_NOT_I33_I_I:%.*]] = icmp eq i8 [[TMP2]], 0 +// DEFAULT-NEXT: br i1 [[CMP_NOT_I33_I_I]], label [[_ZL4NANFPKC_EXIT:%.*]], label [[WHILE_BODY_I34_I_I:%.*]] +// DEFAULT: while.body.i34.i.i: +// DEFAULT-NEXT: [[TMP3:%.*]] = add i8 [[TMP2]], -48 +// DEFAULT-NEXT: [[OR_COND_I35_I_I:%.*]] = icmp ult i8 [[TMP3]], 10 +// DEFAULT-NEXT: br i1 [[OR_COND_I35_I_I]], label [[IF_END31_I_I_I:%.*]], label [[IF_ELSE_I_I_I:%.*]] +// DEFAULT: if.else.i.i.i: +// DEFAULT-NEXT: [[TMP4:%.*]] = add i8 [[TMP2]], -97 +// DEFAULT-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP4]], 6 +// DEFAULT-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] +// DEFAULT: if.else17.i.i.i: +// DEFAULT-NEXT: [[TMP5:%.*]] = add i8 [[TMP2]], -65 +// DEFAULT-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP5]], 6 +// DEFAULT-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[CLEANUP_I36_I_I]] +// DEFAULT: if.end31.i.i.i: +// DEFAULT-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I34_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] +// DEFAULT-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I32_I_I]], 4 +// DEFAULT-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 +// DEFAULT-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] +// DEFAULT-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] +// DEFAULT-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 +// DEFAULT-NEXT: br label [[CLEANUP_I36_I_I]] +// DEFAULT: cleanup.i36.i.i: +// 
DEFAULT-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] +// DEFAULT-NEXT: [[__R_2_I_I_I]] = phi i64 [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ [[__R_0_I32_I_I]], [[IF_ELSE17_I_I_I]] ] +// DEFAULT-NEXT: [[COND_I_I_I:%.*]] = phi i1 [ true, [[IF_END31_I_I_I]] ], [ false, [[IF_ELSE17_I_I_I]] ] +// DEFAULT-NEXT: br i1 [[COND_I_I_I]], label [[WHILE_COND_I30_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP11]] +// DEFAULT: while.cond.i.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_0_I_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I_I_I:%.*]], [[CLEANUP_I_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN_I_I]] ] +// DEFAULT-NEXT: [[__R_0_I_I_I:%.*]] = phi i64 [ [[__R_1_I_I_I:%.*]], [[CLEANUP_I_I_I]] ], [ 0, [[IF_THEN_I_I]] ] +// DEFAULT-NEXT: [[TMP6:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 +// DEFAULT-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I_I_I:%.*]] +// DEFAULT: while.body.i.i.i: +// DEFAULT-NEXT: [[TMP7:%.*]] = and i8 [[TMP6]], -8 +// DEFAULT-NEXT: [[OR_COND_I_I_I:%.*]] = icmp eq i8 [[TMP7]], 48 +// DEFAULT-NEXT: br i1 [[OR_COND_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[CLEANUP_I_I_I]] +// DEFAULT: if.then.i.i.i: +// DEFAULT-NEXT: [[MUL_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I]], 3 +// DEFAULT-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 +// DEFAULT-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 +// DEFAULT-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] +// DEFAULT-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 +// DEFAULT-NEXT: br label [[CLEANUP_I_I_I]] +// DEFAULT: cleanup.i.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] +// DEFAULT-NEXT: [[__R_1_I_I_I]] = phi i64 [ [[SUB_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] +// DEFAULT-NEXT: br i1 [[OR_COND_I_I_I]], label [[WHILE_COND_I_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP7]] +// DEFAULT: while.cond.i14.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_0_I15_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I21_I_I:%.*]], [[CLEANUP_I20_I_I:%.*]] ], [ [[TAG]], [[ENTRY:%.*]] ] +// DEFAULT-NEXT: [[__R_0_I16_I_I:%.*]] = phi i64 [ [[__R_1_I22_I_I:%.*]], [[CLEANUP_I20_I_I]] ], [ 0, [[ENTRY]] ] +// DEFAULT-NEXT: [[TMP8:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I15_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP8]], 0 +// DEFAULT-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I18_I_I:%.*]] +// DEFAULT: while.body.i18.i.i: +// DEFAULT-NEXT: [[TMP9:%.*]] = add i8 [[TMP8]], -48 +// DEFAULT-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp ult i8 [[TMP9]], 10 +// DEFAULT-NEXT: br i1 [[OR_COND_I19_I_I]], label [[IF_THEN_I24_I_I:%.*]], label [[CLEANUP_I20_I_I]] +// DEFAULT: if.then.i24.i.i: +// DEFAULT-NEXT: [[MUL_I25_I_I:%.*]] = mul i64 [[__R_0_I16_I_I]], 10 +// DEFAULT-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 +// DEFAULT-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 +// DEFAULT-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] +// DEFAULT-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 +// DEFAULT-NEXT: br label [[CLEANUP_I20_I_I]] +// DEFAULT: cleanup.i20.i.i: +// DEFAULT-NEXT: 
[[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ [[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] +// DEFAULT-NEXT: [[__R_1_I22_I_I]] = phi i64 [ [[SUB_I28_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_BODY_I18_I_I]] ] +// DEFAULT-NEXT: br i1 [[OR_COND_I19_I_I]], label [[WHILE_COND_I14_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP10]] +// DEFAULT: _ZL4nanfPKc.exit: +// DEFAULT-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[CLEANUP_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_COND_I_I_I]] ], [ 0, [[CLEANUP_I36_I_I]] ], [ [[__R_0_I32_I_I]], [[WHILE_COND_I30_I_I]] ], [ 0, [[CLEANUP_I20_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_COND_I14_I_I]] ] +// DEFAULT-NEXT: [[CONV_I:%.*]] = trunc i64 [[RETVAL_0_I_I]] to i32 +// DEFAULT-NEXT: [[BF_VALUE_I:%.*]] = and i32 [[CONV_I]], 4194303 +// DEFAULT-NEXT: [[BF_SET9_I:%.*]] = or disjoint i32 [[BF_VALUE_I]], 2143289344 +// DEFAULT-NEXT: [[TMP10:%.*]] = bitcast i32 [[BF_SET9_I]] to float +// DEFAULT-NEXT: ret float [[TMP10]] +// +// FINITEONLY-LABEL: @test_nanf( +// FINITEONLY-NEXT: entry: +// FINITEONLY-NEXT: ret float poison +// +// APPROX-LABEL: @test_nanf( +// APPROX-NEXT: entry: +// APPROX-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG:%.*]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 +// APPROX-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] +// APPROX: if.then.i.i: +// APPROX-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 +// APPROX-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ +// APPROX-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] +// APPROX-NEXT: i8 88, label [[WHILE_COND_I30_I_I_PREHEADER]] +// APPROX-NEXT: ] +// APPROX: while.cond.i30.i.i.preheader: +// APPROX-NEXT: br label [[WHILE_COND_I30_I_I:%.*]] +// APPROX: while.cond.i30.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_0_I31_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I37_I_I:%.*]], [[CLEANUP_I36_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[WHILE_COND_I30_I_I_PREHEADER]] ] +// APPROX-NEXT: [[__R_0_I32_I_I:%.*]] = phi i64 [ [[__R_2_I_I_I:%.*]], [[CLEANUP_I36_I_I]] ], [ 0, [[WHILE_COND_I30_I_I_PREHEADER]] ] +// APPROX-NEXT: [[TMP2:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I31_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[CMP_NOT_I33_I_I:%.*]] = icmp eq i8 [[TMP2]], 0 +// APPROX-NEXT: br i1 [[CMP_NOT_I33_I_I]], label [[_ZL4NANFPKC_EXIT:%.*]], label [[WHILE_BODY_I34_I_I:%.*]] +// APPROX: while.body.i34.i.i: +// APPROX-NEXT: [[TMP3:%.*]] = add i8 [[TMP2]], -48 +// APPROX-NEXT: [[OR_COND_I35_I_I:%.*]] = icmp ult i8 [[TMP3]], 10 +// APPROX-NEXT: br i1 [[OR_COND_I35_I_I]], label [[IF_END31_I_I_I:%.*]], label [[IF_ELSE_I_I_I:%.*]] +// APPROX: if.else.i.i.i: +// APPROX-NEXT: [[TMP4:%.*]] = add i8 [[TMP2]], -97 +// APPROX-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP4]], 6 +// APPROX-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] +// APPROX: if.else17.i.i.i: +// APPROX-NEXT: [[TMP5:%.*]] = add i8 [[TMP2]], -65 +// APPROX-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP5]], 6 +// APPROX-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[CLEANUP_I36_I_I]] +// APPROX: if.end31.i.i.i: +// APPROX-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I34_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] +// APPROX-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I32_I_I]], 4 +// APPROX-NEXT: 
[[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 +// APPROX-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] +// APPROX-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] +// APPROX-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 +// APPROX-NEXT: br label [[CLEANUP_I36_I_I]] +// APPROX: cleanup.i36.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] +// APPROX-NEXT: [[__R_2_I_I_I]] = phi i64 [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ [[__R_0_I32_I_I]], [[IF_ELSE17_I_I_I]] ] +// APPROX-NEXT: [[COND_I_I_I:%.*]] = phi i1 [ true, [[IF_END31_I_I_I]] ], [ false, [[IF_ELSE17_I_I_I]] ] +// APPROX-NEXT: br i1 [[COND_I_I_I]], label [[WHILE_COND_I30_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP11]] +// APPROX: while.cond.i.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_0_I_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I_I_I:%.*]], [[CLEANUP_I_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN_I_I]] ] +// APPROX-NEXT: [[__R_0_I_I_I:%.*]] = phi i64 [ [[__R_1_I_I_I:%.*]], [[CLEANUP_I_I_I]] ], [ 0, [[IF_THEN_I_I]] ] +// APPROX-NEXT: [[TMP6:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 +// APPROX-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I_I_I:%.*]] +// APPROX: while.body.i.i.i: +// APPROX-NEXT: [[TMP7:%.*]] = and i8 [[TMP6]], -8 +// APPROX-NEXT: [[OR_COND_I_I_I:%.*]] = icmp eq i8 [[TMP7]], 48 +// APPROX-NEXT: br i1 [[OR_COND_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[CLEANUP_I_I_I]] +// APPROX: if.then.i.i.i: +// APPROX-NEXT: [[MUL_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I]], 3 +// APPROX-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 +// APPROX-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 +// APPROX-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] +// APPROX-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 +// APPROX-NEXT: br label [[CLEANUP_I_I_I]] +// APPROX: cleanup.i.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] +// APPROX-NEXT: [[__R_1_I_I_I]] = phi i64 [ [[SUB_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] +// APPROX-NEXT: br i1 [[OR_COND_I_I_I]], label [[WHILE_COND_I_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP7]] +// APPROX: while.cond.i14.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_0_I15_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I21_I_I:%.*]], [[CLEANUP_I20_I_I:%.*]] ], [ [[TAG]], [[ENTRY:%.*]] ] +// APPROX-NEXT: [[__R_0_I16_I_I:%.*]] = phi i64 [ [[__R_1_I22_I_I:%.*]], [[CLEANUP_I20_I_I]] ], [ 0, [[ENTRY]] ] +// APPROX-NEXT: [[TMP8:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I15_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP8]], 0 +// APPROX-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I18_I_I:%.*]] +// APPROX: while.body.i18.i.i: +// APPROX-NEXT: [[TMP9:%.*]] = add i8 [[TMP8]], -48 +// APPROX-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp ult i8 [[TMP9]], 10 +// APPROX-NEXT: br i1 [[OR_COND_I19_I_I]], label [[IF_THEN_I24_I_I:%.*]], label [[CLEANUP_I20_I_I]] +// APPROX: if.then.i24.i.i: +// APPROX-NEXT: [[MUL_I25_I_I:%.*]] = mul i64 [[__R_0_I16_I_I]], 10 +// APPROX-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 +// 
APPROX-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 +// APPROX-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] +// APPROX-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 +// APPROX-NEXT: br label [[CLEANUP_I20_I_I]] +// APPROX: cleanup.i20.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ [[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] +// APPROX-NEXT: [[__R_1_I22_I_I]] = phi i64 [ [[SUB_I28_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_BODY_I18_I_I]] ] +// APPROX-NEXT: br i1 [[OR_COND_I19_I_I]], label [[WHILE_COND_I14_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP10]] +// APPROX: _ZL4nanfPKc.exit: +// APPROX-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[CLEANUP_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_COND_I_I_I]] ], [ 0, [[CLEANUP_I36_I_I]] ], [ [[__R_0_I32_I_I]], [[WHILE_COND_I30_I_I]] ], [ 0, [[CLEANUP_I20_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_COND_I14_I_I]] ] +// APPROX-NEXT: [[CONV_I:%.*]] = trunc i64 [[RETVAL_0_I_I]] to i32 +// APPROX-NEXT: [[BF_VALUE_I:%.*]] = and i32 [[CONV_I]], 4194303 +// APPROX-NEXT: [[BF_SET9_I:%.*]] = or disjoint i32 [[BF_VALUE_I]], 2143289344 +// APPROX-NEXT: [[TMP10:%.*]] = bitcast i32 [[BF_SET9_I]] to float +// APPROX-NEXT: ret float [[TMP10]] // extern "C" __device__ float test_nanf(const char *tag) { return nanf(tag); } -// CHECK-LABEL: @test_nan( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG:%.*]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 -// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] -// CHECK: if.then.i.i: -// CHECK-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 -// CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ -// CHECK-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] -// CHECK-NEXT: i8 88, label [[WHILE_COND_I30_I_I_PREHEADER]] -// CHECK-NEXT: ] -// CHECK: while.cond.i30.i.i.preheader: -// CHECK-NEXT: br label [[WHILE_COND_I30_I_I:%.*]] -// CHECK: while.cond.i30.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_0_I31_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I37_I_I:%.*]], [[CLEANUP_I36_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[WHILE_COND_I30_I_I_PREHEADER]] ] -// CHECK-NEXT: [[__R_0_I32_I_I:%.*]] = phi i64 [ [[__R_2_I_I_I:%.*]], [[CLEANUP_I36_I_I]] ], [ 0, [[WHILE_COND_I30_I_I_PREHEADER]] ] -// CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I31_I_I]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: [[CMP_NOT_I33_I_I:%.*]] = icmp eq i8 [[TMP2]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I33_I_I]], label [[_ZL3NANPKC_EXIT:%.*]], label [[WHILE_BODY_I34_I_I:%.*]] -// CHECK: while.body.i34.i.i: -// CHECK-NEXT: [[TMP3:%.*]] = add i8 [[TMP2]], -48 -// CHECK-NEXT: [[OR_COND_I35_I_I:%.*]] = icmp ult i8 [[TMP3]], 10 -// CHECK-NEXT: br i1 [[OR_COND_I35_I_I]], label [[IF_END31_I_I_I:%.*]], label [[IF_ELSE_I_I_I:%.*]] -// CHECK: if.else.i.i.i: -// CHECK-NEXT: [[TMP4:%.*]] = add i8 [[TMP2]], -97 -// CHECK-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP4]], 6 -// CHECK-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] -// CHECK: if.else17.i.i.i: -// CHECK-NEXT: [[TMP5:%.*]] = add i8 [[TMP2]], -65 -// CHECK-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP5]], 6 -// CHECK-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label 
[[CLEANUP_I36_I_I]] -// CHECK: if.end31.i.i.i: -// CHECK-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I34_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] -// CHECK-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I32_I_I]], 4 -// CHECK-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 -// CHECK-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] -// CHECK-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 -// CHECK-NEXT: br label [[CLEANUP_I36_I_I]] -// CHECK: cleanup.i36.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] -// CHECK-NEXT: [[__R_2_I_I_I]] = phi i64 [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ [[__R_0_I32_I_I]], [[IF_ELSE17_I_I_I]] ] -// CHECK-NEXT: [[COND_I_I_I:%.*]] = phi i1 [ true, [[IF_END31_I_I_I]] ], [ false, [[IF_ELSE17_I_I_I]] ] -// CHECK-NEXT: br i1 [[COND_I_I_I]], label [[WHILE_COND_I30_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP11]] -// CHECK: while.cond.i.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_0_I_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I_I_I:%.*]], [[CLEANUP_I_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN_I_I]] ] -// CHECK-NEXT: [[__R_0_I_I_I:%.*]] = phi i64 [ [[__R_1_I_I_I:%.*]], [[CLEANUP_I_I_I]] ], [ 0, [[IF_THEN_I_I]] ] -// CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I_I_I:%.*]] -// CHECK: while.body.i.i.i: -// CHECK-NEXT: [[TMP7:%.*]] = and i8 [[TMP6]], -8 -// CHECK-NEXT: [[OR_COND_I_I_I:%.*]] = icmp eq i8 [[TMP7]], 48 -// CHECK-NEXT: br i1 [[OR_COND_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[CLEANUP_I_I_I]] -// CHECK: if.then.i.i.i: -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I]], 3 -// CHECK-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 -// CHECK-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 -// CHECK-NEXT: br label [[CLEANUP_I_I_I]] -// CHECK: cleanup.i.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] -// CHECK-NEXT: [[__R_1_I_I_I]] = phi i64 [ [[SUB_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] -// CHECK-NEXT: br i1 [[OR_COND_I_I_I]], label [[WHILE_COND_I_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP7]] -// CHECK: while.cond.i14.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_0_I15_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I21_I_I:%.*]], [[CLEANUP_I20_I_I:%.*]] ], [ [[TAG]], [[ENTRY:%.*]] ] -// CHECK-NEXT: [[__R_0_I16_I_I:%.*]] = phi i64 [ [[__R_1_I22_I_I:%.*]], [[CLEANUP_I20_I_I]] ], [ 0, [[ENTRY]] ] -// CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I15_I_I]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP8]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I18_I_I:%.*]] -// CHECK: while.body.i18.i.i: -// CHECK-NEXT: [[TMP9:%.*]] = add i8 [[TMP8]], -48 -// CHECK-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp ult i8 [[TMP9]], 10 -// CHECK-NEXT: br i1 [[OR_COND_I19_I_I]], label 
[[IF_THEN_I24_I_I:%.*]], label [[CLEANUP_I20_I_I]] -// CHECK: if.then.i24.i.i: -// CHECK-NEXT: [[MUL_I25_I_I:%.*]] = mul i64 [[__R_0_I16_I_I]], 10 -// CHECK-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 -// CHECK-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 -// CHECK-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 -// CHECK-NEXT: br label [[CLEANUP_I20_I_I]] -// CHECK: cleanup.i20.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ [[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] -// CHECK-NEXT: [[__R_1_I22_I_I]] = phi i64 [ [[SUB_I28_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_BODY_I18_I_I]] ] -// CHECK-NEXT: br i1 [[OR_COND_I19_I_I]], label [[WHILE_COND_I14_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP10]] -// CHECK: _ZL3nanPKc.exit: -// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[CLEANUP_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_COND_I_I_I]] ], [ 0, [[CLEANUP_I36_I_I]] ], [ [[__R_0_I32_I_I]], [[WHILE_COND_I30_I_I]] ], [ 0, [[CLEANUP_I20_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_COND_I14_I_I]] ] -// CHECK-NEXT: [[BF_VALUE_I:%.*]] = and i64 [[RETVAL_0_I_I]], 2251799813685247 -// CHECK-NEXT: [[BF_SET9_I:%.*]] = or disjoint i64 [[BF_VALUE_I]], 9221120237041090560 -// CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[BF_SET9_I]] to double -// CHECK-NEXT: ret double [[TMP10]] +// DEFAULT-LABEL: @test_nan( +// DEFAULT-NEXT: entry: +// DEFAULT-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG:%.*]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 +// DEFAULT-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] +// DEFAULT: if.then.i.i: +// DEFAULT-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 +// DEFAULT-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ +// DEFAULT-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] +// DEFAULT-NEXT: i8 88, label [[WHILE_COND_I30_I_I_PREHEADER]] +// DEFAULT-NEXT: ] +// DEFAULT: while.cond.i30.i.i.preheader: +// DEFAULT-NEXT: br label [[WHILE_COND_I30_I_I:%.*]] +// DEFAULT: while.cond.i30.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_0_I31_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I37_I_I:%.*]], [[CLEANUP_I36_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[WHILE_COND_I30_I_I_PREHEADER]] ] +// DEFAULT-NEXT: [[__R_0_I32_I_I:%.*]] = phi i64 [ [[__R_2_I_I_I:%.*]], [[CLEANUP_I36_I_I]] ], [ 0, [[WHILE_COND_I30_I_I_PREHEADER]] ] +// DEFAULT-NEXT: [[TMP2:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I31_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[CMP_NOT_I33_I_I:%.*]] = icmp eq i8 [[TMP2]], 0 +// DEFAULT-NEXT: br i1 [[CMP_NOT_I33_I_I]], label [[_ZL3NANPKC_EXIT:%.*]], label [[WHILE_BODY_I34_I_I:%.*]] +// DEFAULT: while.body.i34.i.i: +// DEFAULT-NEXT: [[TMP3:%.*]] = add i8 [[TMP2]], -48 +// DEFAULT-NEXT: [[OR_COND_I35_I_I:%.*]] = icmp ult i8 [[TMP3]], 10 +// DEFAULT-NEXT: br i1 [[OR_COND_I35_I_I]], label [[IF_END31_I_I_I:%.*]], label [[IF_ELSE_I_I_I:%.*]] +// DEFAULT: if.else.i.i.i: +// DEFAULT-NEXT: [[TMP4:%.*]] = add i8 [[TMP2]], -97 +// DEFAULT-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP4]], 6 +// DEFAULT-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] +// DEFAULT: if.else17.i.i.i: +// DEFAULT-NEXT: [[TMP5:%.*]] = add i8 [[TMP2]], 
-65 +// DEFAULT-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP5]], 6 +// DEFAULT-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[CLEANUP_I36_I_I]] +// DEFAULT: if.end31.i.i.i: +// DEFAULT-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I34_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] +// DEFAULT-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I32_I_I]], 4 +// DEFAULT-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 +// DEFAULT-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] +// DEFAULT-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] +// DEFAULT-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 +// DEFAULT-NEXT: br label [[CLEANUP_I36_I_I]] +// DEFAULT: cleanup.i36.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] +// DEFAULT-NEXT: [[__R_2_I_I_I]] = phi i64 [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ [[__R_0_I32_I_I]], [[IF_ELSE17_I_I_I]] ] +// DEFAULT-NEXT: [[COND_I_I_I:%.*]] = phi i1 [ true, [[IF_END31_I_I_I]] ], [ false, [[IF_ELSE17_I_I_I]] ] +// DEFAULT-NEXT: br i1 [[COND_I_I_I]], label [[WHILE_COND_I30_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP11]] +// DEFAULT: while.cond.i.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_0_I_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I_I_I:%.*]], [[CLEANUP_I_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN_I_I]] ] +// DEFAULT-NEXT: [[__R_0_I_I_I:%.*]] = phi i64 [ [[__R_1_I_I_I:%.*]], [[CLEANUP_I_I_I]] ], [ 0, [[IF_THEN_I_I]] ] +// DEFAULT-NEXT: [[TMP6:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 +// DEFAULT-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I_I_I:%.*]] +// DEFAULT: while.body.i.i.i: +// DEFAULT-NEXT: [[TMP7:%.*]] = and i8 [[TMP6]], -8 +// DEFAULT-NEXT: [[OR_COND_I_I_I:%.*]] = icmp eq i8 [[TMP7]], 48 +// DEFAULT-NEXT: br i1 [[OR_COND_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[CLEANUP_I_I_I]] +// DEFAULT: if.then.i.i.i: +// DEFAULT-NEXT: [[MUL_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I]], 3 +// DEFAULT-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 +// DEFAULT-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 +// DEFAULT-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] +// DEFAULT-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 +// DEFAULT-NEXT: br label [[CLEANUP_I_I_I]] +// DEFAULT: cleanup.i.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] +// DEFAULT-NEXT: [[__R_1_I_I_I]] = phi i64 [ [[SUB_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] +// DEFAULT-NEXT: br i1 [[OR_COND_I_I_I]], label [[WHILE_COND_I_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP7]] +// DEFAULT: while.cond.i14.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_0_I15_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I21_I_I:%.*]], [[CLEANUP_I20_I_I:%.*]] ], [ [[TAG]], [[ENTRY:%.*]] ] +// DEFAULT-NEXT: [[__R_0_I16_I_I:%.*]] = phi i64 [ [[__R_1_I22_I_I:%.*]], [[CLEANUP_I20_I_I]] ], [ 0, [[ENTRY]] ] +// DEFAULT-NEXT: [[TMP8:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I15_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP8]], 0 +// DEFAULT-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL3NANPKC_EXIT]], label 
[[WHILE_BODY_I18_I_I:%.*]] +// DEFAULT: while.body.i18.i.i: +// DEFAULT-NEXT: [[TMP9:%.*]] = add i8 [[TMP8]], -48 +// DEFAULT-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp ult i8 [[TMP9]], 10 +// DEFAULT-NEXT: br i1 [[OR_COND_I19_I_I]], label [[IF_THEN_I24_I_I:%.*]], label [[CLEANUP_I20_I_I]] +// DEFAULT: if.then.i24.i.i: +// DEFAULT-NEXT: [[MUL_I25_I_I:%.*]] = mul i64 [[__R_0_I16_I_I]], 10 +// DEFAULT-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 +// DEFAULT-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 +// DEFAULT-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] +// DEFAULT-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 +// DEFAULT-NEXT: br label [[CLEANUP_I20_I_I]] +// DEFAULT: cleanup.i20.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ [[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] +// DEFAULT-NEXT: [[__R_1_I22_I_I]] = phi i64 [ [[SUB_I28_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_BODY_I18_I_I]] ] +// DEFAULT-NEXT: br i1 [[OR_COND_I19_I_I]], label [[WHILE_COND_I14_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP10]] +// DEFAULT: _ZL3nanPKc.exit: +// DEFAULT-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[CLEANUP_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_COND_I_I_I]] ], [ 0, [[CLEANUP_I36_I_I]] ], [ [[__R_0_I32_I_I]], [[WHILE_COND_I30_I_I]] ], [ 0, [[CLEANUP_I20_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_COND_I14_I_I]] ] +// DEFAULT-NEXT: [[BF_VALUE_I:%.*]] = and i64 [[RETVAL_0_I_I]], 2251799813685247 +// DEFAULT-NEXT: [[BF_SET9_I:%.*]] = or disjoint i64 [[BF_VALUE_I]], 9221120237041090560 +// DEFAULT-NEXT: [[TMP10:%.*]] = bitcast i64 [[BF_SET9_I]] to double +// DEFAULT-NEXT: ret double [[TMP10]] +// +// FINITEONLY-LABEL: @test_nan( +// FINITEONLY-NEXT: entry: +// FINITEONLY-NEXT: ret double poison +// +// APPROX-LABEL: @test_nan( +// APPROX-NEXT: entry: +// APPROX-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG:%.*]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 +// APPROX-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] +// APPROX: if.then.i.i: +// APPROX-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 +// APPROX-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ +// APPROX-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] +// APPROX-NEXT: i8 88, label [[WHILE_COND_I30_I_I_PREHEADER]] +// APPROX-NEXT: ] +// APPROX: while.cond.i30.i.i.preheader: +// APPROX-NEXT: br label [[WHILE_COND_I30_I_I:%.*]] +// APPROX: while.cond.i30.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_0_I31_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I37_I_I:%.*]], [[CLEANUP_I36_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[WHILE_COND_I30_I_I_PREHEADER]] ] +// APPROX-NEXT: [[__R_0_I32_I_I:%.*]] = phi i64 [ [[__R_2_I_I_I:%.*]], [[CLEANUP_I36_I_I]] ], [ 0, [[WHILE_COND_I30_I_I_PREHEADER]] ] +// APPROX-NEXT: [[TMP2:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I31_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[CMP_NOT_I33_I_I:%.*]] = icmp eq i8 [[TMP2]], 0 +// APPROX-NEXT: br i1 [[CMP_NOT_I33_I_I]], label [[_ZL3NANPKC_EXIT:%.*]], label [[WHILE_BODY_I34_I_I:%.*]] +// APPROX: while.body.i34.i.i: +// APPROX-NEXT: [[TMP3:%.*]] = add i8 [[TMP2]], -48 +// APPROX-NEXT: [[OR_COND_I35_I_I:%.*]] = icmp ult i8 [[TMP3]], 10 +// APPROX-NEXT: br i1 [[OR_COND_I35_I_I]], label [[IF_END31_I_I_I:%.*]], label 
[[IF_ELSE_I_I_I:%.*]] +// APPROX: if.else.i.i.i: +// APPROX-NEXT: [[TMP4:%.*]] = add i8 [[TMP2]], -97 +// APPROX-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP4]], 6 +// APPROX-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] +// APPROX: if.else17.i.i.i: +// APPROX-NEXT: [[TMP5:%.*]] = add i8 [[TMP2]], -65 +// APPROX-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP5]], 6 +// APPROX-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[CLEANUP_I36_I_I]] +// APPROX: if.end31.i.i.i: +// APPROX-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I34_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] +// APPROX-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I32_I_I]], 4 +// APPROX-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 +// APPROX-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] +// APPROX-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] +// APPROX-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 +// APPROX-NEXT: br label [[CLEANUP_I36_I_I]] +// APPROX: cleanup.i36.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] +// APPROX-NEXT: [[__R_2_I_I_I]] = phi i64 [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ [[__R_0_I32_I_I]], [[IF_ELSE17_I_I_I]] ] +// APPROX-NEXT: [[COND_I_I_I:%.*]] = phi i1 [ true, [[IF_END31_I_I_I]] ], [ false, [[IF_ELSE17_I_I_I]] ] +// APPROX-NEXT: br i1 [[COND_I_I_I]], label [[WHILE_COND_I30_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP11]] +// APPROX: while.cond.i.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_0_I_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I_I_I:%.*]], [[CLEANUP_I_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN_I_I]] ] +// APPROX-NEXT: [[__R_0_I_I_I:%.*]] = phi i64 [ [[__R_1_I_I_I:%.*]], [[CLEANUP_I_I_I]] ], [ 0, [[IF_THEN_I_I]] ] +// APPROX-NEXT: [[TMP6:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 +// APPROX-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I_I_I:%.*]] +// APPROX: while.body.i.i.i: +// APPROX-NEXT: [[TMP7:%.*]] = and i8 [[TMP6]], -8 +// APPROX-NEXT: [[OR_COND_I_I_I:%.*]] = icmp eq i8 [[TMP7]], 48 +// APPROX-NEXT: br i1 [[OR_COND_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[CLEANUP_I_I_I]] +// APPROX: if.then.i.i.i: +// APPROX-NEXT: [[MUL_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I]], 3 +// APPROX-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 +// APPROX-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 +// APPROX-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] +// APPROX-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 +// APPROX-NEXT: br label [[CLEANUP_I_I_I]] +// APPROX: cleanup.i.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] +// APPROX-NEXT: [[__R_1_I_I_I]] = phi i64 [ [[SUB_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] +// APPROX-NEXT: br i1 [[OR_COND_I_I_I]], label [[WHILE_COND_I_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP7]] +// APPROX: while.cond.i14.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_0_I15_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I21_I_I:%.*]], [[CLEANUP_I20_I_I:%.*]] ], [ [[TAG]], [[ENTRY:%.*]] ] +// APPROX-NEXT: [[__R_0_I16_I_I:%.*]] = phi i64 [ 
[[__R_1_I22_I_I:%.*]], [[CLEANUP_I20_I_I]] ], [ 0, [[ENTRY]] ] +// APPROX-NEXT: [[TMP8:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I15_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP8]], 0 +// APPROX-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I18_I_I:%.*]] +// APPROX: while.body.i18.i.i: +// APPROX-NEXT: [[TMP9:%.*]] = add i8 [[TMP8]], -48 +// APPROX-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp ult i8 [[TMP9]], 10 +// APPROX-NEXT: br i1 [[OR_COND_I19_I_I]], label [[IF_THEN_I24_I_I:%.*]], label [[CLEANUP_I20_I_I]] +// APPROX: if.then.i24.i.i: +// APPROX-NEXT: [[MUL_I25_I_I:%.*]] = mul i64 [[__R_0_I16_I_I]], 10 +// APPROX-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 +// APPROX-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 +// APPROX-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] +// APPROX-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 +// APPROX-NEXT: br label [[CLEANUP_I20_I_I]] +// APPROX: cleanup.i20.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ [[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] +// APPROX-NEXT: [[__R_1_I22_I_I]] = phi i64 [ [[SUB_I28_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_BODY_I18_I_I]] ] +// APPROX-NEXT: br i1 [[OR_COND_I19_I_I]], label [[WHILE_COND_I14_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP10]] +// APPROX: _ZL3nanPKc.exit: +// APPROX-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[CLEANUP_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_COND_I_I_I]] ], [ 0, [[CLEANUP_I36_I_I]] ], [ [[__R_0_I32_I_I]], [[WHILE_COND_I30_I_I]] ], [ 0, [[CLEANUP_I20_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_COND_I14_I_I]] ] +// APPROX-NEXT: [[BF_VALUE_I:%.*]] = and i64 [[RETVAL_0_I_I]], 2251799813685247 +// APPROX-NEXT: [[BF_SET9_I:%.*]] = or disjoint i64 [[BF_VALUE_I]], 9221120237041090560 +// APPROX-NEXT: [[TMP10:%.*]] = bitcast i64 [[BF_SET9_I]] to double +// APPROX-NEXT: ret double [[TMP10]] // extern "C" __device__ double test_nan(const char *tag) { return nan(tag); diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 173faa32a3878d..533fe62fb8cdd6 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -5921,6 +5921,61 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts, break; } + case Instruction::BitCast: { + const Value *Src; + if (!match(Op, m_ElementWiseBitCast(m_Value(Src))) || + !Src->getType()->isIntOrIntVectorTy()) + break; + + const Type *Ty = Op->getType()->getScalarType(); + KnownBits Bits(Ty->getScalarSizeInBits()); + computeKnownBits(Src, DemandedElts, Bits, Depth + 1, Q); + + // Transfer information from the sign bit. + if (Bits.isNonNegative()) + Known.signBitMustBeZero(); + else if (Bits.isNegative()) + Known.signBitMustBeOne(); + + if (Ty->isIEEE()) { + // IEEE floats are NaN when all bits of the exponent plus at least one of + // the fraction bits are 1. This means: + // - If we assume unknown bits are 0 and the value is NaN, it will + // always be NaN + // - If we assume unknown bits are 1 and the value is not NaN, it can + // never be NaN + if (APFloat(Ty->getFltSemantics(), Bits.One).isNaN()) + Known.KnownFPClasses = fcNan; + else if (!APFloat(Ty->getFltSemantics(), ~Bits.Zero).isNaN()) + Known.knownNot(fcNan); + + // Build KnownBits representing Inf and check if it must be equal or + // unequal to this value. 
+ auto InfKB = KnownBits::makeConstant( + APFloat::getInf(Ty->getFltSemantics()).bitcastToAPInt()); + InfKB.Zero.clearSignBit(); + if (const auto InfResult = KnownBits::eq(Bits, InfKB)) { + assert(!InfResult.value()); + Known.knownNot(fcInf); + } else if (Bits == InfKB) { + Known.KnownFPClasses = fcInf; + } + + // Build KnownBits representing Zero and check if it must be equal or + // unequal to this value. + auto ZeroKB = KnownBits::makeConstant( + APFloat::getZero(Ty->getFltSemantics()).bitcastToAPInt()); + ZeroKB.Zero.clearSignBit(); + if (const auto ZeroResult = KnownBits::eq(Bits, ZeroKB)) { + assert(!ZeroResult.value()); + Known.knownNot(fcZero); + } else if (Bits == ZeroKB) { + Known.KnownFPClasses = fcZero; + } + } + + break; + } default: break; } diff --git a/llvm/test/Transforms/Attributor/nofpclass.ll b/llvm/test/Transforms/Attributor/nofpclass.ll index 781ba636c3ab3c..2a6780b60211cf 100644 --- a/llvm/test/Transforms/Attributor/nofpclass.ll +++ b/llvm/test/Transforms/Attributor/nofpclass.ll @@ -2685,11 +2685,291 @@ define @scalable_splat_zero() { ; See https://github.com/llvm/llvm-project/issues/78507 define double @call_abs(double noundef %__x) { +; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; TUNIT-LABEL: define noundef nofpclass(ninf nzero nsub nnorm) double @call_abs +; TUNIT-SAME: (double noundef [[__X:%.*]]) #[[ATTR3]] { +; TUNIT-NEXT: entry: +; TUNIT-NEXT: [[ABS:%.*]] = tail call noundef nofpclass(ninf nzero nsub nnorm) double @llvm.fabs.f64(double noundef [[__X]]) #[[ATTR22]] +; TUNIT-NEXT: ret double [[ABS]] +; +; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CGSCC-LABEL: define noundef nofpclass(ninf nzero nsub nnorm) double @call_abs +; CGSCC-SAME: (double noundef [[__X:%.*]]) #[[ATTR3]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: [[ABS:%.*]] = tail call noundef nofpclass(ninf nzero nsub nnorm) double @llvm.fabs.f64(double noundef [[__X]]) #[[ATTR19]] +; CGSCC-NEXT: ret double [[ABS]] +; entry: %abs = tail call double @llvm.fabs.f64(double %__x) ret double %abs } +define float @bitcast_to_float_sign_0(i32 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @bitcast_to_float_sign_0 +; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ARG]], 1 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i32 [[SHR]] to float +; CHECK-NEXT: ret float [[CAST]] +; + %shr = lshr i32 %arg, 1 + %cast = bitcast i32 %shr to float + ret float %cast +} + +define float @bitcast_to_float_nnan(i32 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(nan inf nzero nsub nnorm) float @bitcast_to_float_nnan +; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ARG]], 2 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i32 [[SHR]] to float +; CHECK-NEXT: ret float [[CAST]] +; + %shr = lshr i32 %arg, 2 + %cast = bitcast i32 %shr to float + ret float %cast +} + +define float @bitcast_to_float_sign_1(i32 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @bitcast_to_float_sign_1 +; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[OR:%.*]] = or i32 [[ARG]], -2147483648 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i32 [[OR]] to float +; CHECK-NEXT: ret float 
[[CAST]] +; + %or = or i32 %arg, -2147483648 + %cast = bitcast i32 %or to float + ret float %cast +} + +define float @bitcast_to_float_nan(i32 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(inf zero sub norm) float @bitcast_to_float_nan +; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[OR:%.*]] = or i32 [[ARG]], 2139095041 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i32 [[OR]] to float +; CHECK-NEXT: ret float [[CAST]] +; + %or = or i32 %arg, 2139095041 + %cast = bitcast i32 %or to float + ret float %cast +} + +define float @bitcast_to_float_zero(i32 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(nan inf sub norm) float @bitcast_to_float_zero +; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[ARG]], 31 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i32 [[SHL]] to float +; CHECK-NEXT: ret float [[CAST]] +; + %shl = shl i32 %arg, 31 + %cast = bitcast i32 %shl to float + ret float %cast +} + +define float @bitcast_to_float_nzero(i32 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(zero) float @bitcast_to_float_nzero +; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[OR:%.*]] = or i32 [[ARG]], 134217728 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i32 [[OR]] to float +; CHECK-NEXT: ret float [[CAST]] +; + %or = or i32 %arg, 134217728 + %cast = bitcast i32 %or to float + ret float %cast +} + +define float @bitcast_to_float_inf(i32 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(nan zero sub norm) float @bitcast_to_float_inf +; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SHR:%.*]] = shl i32 [[ARG]], 31 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHR]], 2139095040 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i32 [[OR]] to float +; CHECK-NEXT: ret float [[CAST]] +; + %shr = shl i32 %arg, 31 + %or = or i32 %shr, 2139095040 + %cast = bitcast i32 %or to float + ret float %cast +} + +define double @bitcast_to_double_sign_0(i64 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) double @bitcast_to_double_sign_0 +; CHECK-SAME: (i64 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SHR:%.*]] = lshr i64 [[ARG]], 1 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i64 [[SHR]] to double +; CHECK-NEXT: ret double [[CAST]] +; + %shr = lshr i64 %arg, 1 + %cast = bitcast i64 %shr to double + ret double %cast +} + +define double @bitcast_to_double_nnan(i64 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(nan inf nzero nsub nnorm) double @bitcast_to_double_nnan +; CHECK-SAME: (i64 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SHR:%.*]] = lshr i64 [[ARG]], 2 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i64 [[SHR]] to double +; CHECK-NEXT: ret double [[CAST]] +; + %shr = lshr i64 %arg, 2 + %cast = bitcast i64 %shr to double + ret double %cast +} + +define double @bitcast_to_double_sign_1(i64 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) double @bitcast_to_double_sign_1 +; CHECK-SAME: (i64 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[OR:%.*]] = or i64 
[[ARG]], -9223372036854775808
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast i64 [[OR]] to double
+; CHECK-NEXT:    ret double [[CAST]]
+;
+  %or = or i64 %arg, -9223372036854775808
+  %cast = bitcast i64 %or to double
+  ret double %cast
+}
+
+define double @bitcast_to_double_nan(i64 %arg) {
+; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; CHECK-LABEL: define nofpclass(inf zero sub norm) double @bitcast_to_double_nan
+; CHECK-SAME: (i64 [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:    [[OR:%.*]] = or i64 [[ARG]], -4503599627370495
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast i64 [[OR]] to double
+; CHECK-NEXT:    ret double [[CAST]]
+;
+  %or = or i64 %arg, -4503599627370495
+  %cast = bitcast i64 %or to double
+  ret double %cast
+}
+
+
+define double @bitcast_to_double_zero(i64 %arg) {
+; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; CHECK-LABEL: define nofpclass(nan inf sub norm) double @bitcast_to_double_zero
+; CHECK-SAME: (i64 [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:    [[SHL:%.*]] = shl i64 [[ARG]], 63
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast i64 [[SHL]] to double
+; CHECK-NEXT:    ret double [[CAST]]
+;
+  %shl = shl i64 %arg, 63
+  %cast = bitcast i64 %shl to double
+  ret double %cast
+}
+
+define double @bitcast_to_double_nzero(i64 %arg) {
+; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; CHECK-LABEL: define nofpclass(zero) double @bitcast_to_double_nzero
+; CHECK-SAME: (i64 [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:    [[OR:%.*]] = or i64 [[ARG]], 1152921504606846976
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast i64 [[OR]] to double
+; CHECK-NEXT:    ret double [[CAST]]
+;
+  %or = or i64 %arg, 1152921504606846976
+  %cast = bitcast i64 %or to double
+  ret double %cast
+}
+
+define double @bitcast_to_double_inf(i64 %arg) {
+; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; CHECK-LABEL: define nofpclass(nan zero sub norm) double @bitcast_to_double_inf
+; CHECK-SAME: (i64 [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:    [[SHR:%.*]] = shl i64 [[ARG]], 63
+; CHECK-NEXT:    [[OR:%.*]] = or i64 [[SHR]], 9218868437227405312
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast i64 [[OR]] to double
+; CHECK-NEXT:    ret double [[CAST]]
+;
+  %shr = shl i64 %arg, 63
+  %or = or i64 %shr, 9218868437227405312
+  %cast = bitcast i64 %or to double
+  ret double %cast
+}
+
+
+define <2 x float> @bitcast_to_float_vect_sign_0(<2 x i32> %arg) {
+; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) <2 x float> @bitcast_to_float_vect_sign_0
+; CHECK-SAME: (<2 x i32> [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:    [[SHR:%.*]] = lshr <2 x i32> [[ARG]], <i32 1, i32 1>
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <2 x i32> [[SHR]] to <2 x float>
+; CHECK-NEXT:    ret <2 x float> [[CAST]]
+;
+  %shr = lshr <2 x i32> %arg, <i32 1, i32 1>
+  %cast = bitcast <2 x i32> %shr to <2 x float>
+  ret <2 x float> %cast
+}
+
+define <2 x float> @bitcast_to_float_vect_nnan(<2 x i32> %arg) {
+; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; CHECK-LABEL: define nofpclass(nan inf nzero nsub nnorm) <2 x float> @bitcast_to_float_vect_nnan
+; CHECK-SAME: (<2 x i32> [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:    [[SHR:%.*]] = lshr <2 x i32> [[ARG]], <i32 2, i32 2>
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <2 x i32> [[SHR]] to <2 x float>
+; CHECK-NEXT:    ret <2 x float> [[CAST]]
+;
+  %shr = lshr <2 x i32> %arg, <i32 2, i32 2>
+  %cast = bitcast <2 x i32> %shr to <2 x float>
+  ret <2 x float> %cast
+}
+
+define <2 x float> @bitcast_to_float_vect_sign_1(<2 x i32> %arg) {
+; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) <2 x float> @bitcast_to_float_vect_sign_1
+; CHECK-SAME: (<2 x i32> [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[ARG]], <i32 -2147483648, i32 -2147483648>
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <2 x i32> [[OR]] to <2 x float>
+; CHECK-NEXT:    ret <2 x float> [[CAST]]
+;
+  %or = or <2 x i32> %arg, <i32 -2147483648, i32 -2147483648>
+  %cast = bitcast <2 x i32> %or to <2 x float>
+  ret <2 x float> %cast
+}
+
+define <2 x float> @bitcast_to_float_vect_nan(<2 x i32> %arg) {
+; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; CHECK-LABEL: define nofpclass(inf zero sub norm) <2 x float> @bitcast_to_float_vect_nan
+; CHECK-SAME: (<2 x i32> [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[ARG]], <i32 2139095041, i32 2139095041>
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <2 x i32> [[OR]] to <2 x float>
+; CHECK-NEXT:    ret <2 x float> [[CAST]]
+;
+  %or = or <2 x i32> %arg, <i32 2139095041, i32 2139095041>
+  %cast = bitcast <2 x i32> %or to <2 x float>
+  ret <2 x float> %cast
+}
+
+define <2 x float> @bitcast_to_float_vect_conservative_1(<2 x i32> %arg) {
+; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; CHECK-LABEL: define <2 x float> @bitcast_to_float_vect_conservative_1
+; CHECK-SAME: (<2 x i32> [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[ARG]],
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <2 x i32> [[OR]] to <2 x float>
+; CHECK-NEXT:    ret <2 x float> [[CAST]]
+;
+  %or = or <2 x i32> %arg,
+  %cast = bitcast <2 x i32> %or to <2 x float>
+  ret <2 x float> %cast
+}
+
+define <2 x float> @bitcast_to_float_vect_conservative_2(<2 x i32> %arg) {
+; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; CHECK-LABEL: define <2 x float> @bitcast_to_float_vect_conservative_2
+; CHECK-SAME: (<2 x i32> [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[ARG]],
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <2 x i32> [[OR]] to <2 x float>
+; CHECK-NEXT:    ret <2 x float> [[CAST]]
+;
+  %or = or <2 x i32> %arg,
+  %cast = bitcast <2 x i32> %or to <2 x float>
+  ret <2 x float> %cast
+}
+
 declare i64 @_Z13get_global_idj(i32 noundef)

 attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" }

From c4a53811c18b02490cbdc65be494e49018e23900 Mon Sep 17 00:00:00 2001
From: Sjoerd Meijer
Date: Fri, 30 Aug 2024 15:36:47 +0100
Subject: [PATCH 39/98] [test-suite] Document the LLVM test-suite benchmark
 apps (#105843)

There is no documentation or description of the different apps in the
LLVM benchmark test-suite, and this is a first attempt to document this
for the MultiSource apps.
---
 llvm/docs/TestSuiteGuide.md | 38 +++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/llvm/docs/TestSuiteGuide.md b/llvm/docs/TestSuiteGuide.md
index 9552cd89aa1c1b..19db0ee7d01b82 100644
--- a/llvm/docs/TestSuiteGuide.md
+++ b/llvm/docs/TestSuiteGuide.md
@@ -134,6 +134,44 @@ Every program can work as a correctness test. Some programs are unsuitable
 for performance measurements. Setting the `TEST_SUITE_BENCHMARKING_ONLY`
 CMake option to `ON` will disable them.
+The MultiSource benchmarks consist of the following apps and benchmarks:
+
+| MultiSource | Language | Application Area | Remark |
+|----------------------|-----------|-------------------------------|----------------------|
+| 7zip | C/C++ | Compression/Decompression | |
+| ASCI_Purple | C | SMG2000 benchmark and solver | Memory-intensive app |
+| ASC_Sequoia | C | Simulation and solver | |
+| BitBench | C | uudecode/uuencode utility | Bit Stream benchmark for functional compilers |
+| Bullet | C++ | Bullet 2.75 physics engine | |
+| DOE-ProxyApps-C++ | C++ | HPC/scientific apps | Small applications, representative of our larger DOE workloads |
+| DOE-ProxyApps-C | C | HPC/scientific apps | " |
+| Fhourstones | C | Game/solver | Integer benchmark that efficiently solves positions in the game of Connect-4 |
+| Fhourstones-3.1 | C | Game/solver | " |
+| FreeBench | C | Benchmark suite | Raytracer, four in a row, neural network, file compressor, Fast Fourier/Cosine/Sine Transform |
+| llubenchmark | C | Linked-list micro-benchmark | |
+| mafft | C | Bioinformatics | A multiple sequence alignment program |
+| MallocBench | C | Benchmark suite | cfrac, espresso, gawk, gs, make, p2c, perl |
+| McCat | C | Benchmark suite | Quicksort, bubblesort, eigenvalues |
+| mediabench | C | Benchmark suite | adpcm, g721, gsm, jpeg, mpeg2 |
+| MiBench | C | Embedded benchmark suite | Automotive, consumer, office, security, telecom apps |
+| nbench | C | | BYTE Magazine's BYTEmark benchmark program |
+| NPB-serial | C | Parallel computing | Serial version of the NPB IS code |
+| Olden | C | Data Structures | SGI version of the Olden benchmark |
+| OptimizerEval | C | Solver | Preston Briggs' optimizer evaluation framework |
+| PAQ8p | C++ | Data compression | |
+| Prolangs-C++ | C++ | Benchmark suite | city, employ, life, NP, ocean, primes, simul, vcirc |
+| Prolangs-C | C | Benchmark suite | agrep, archie-client, bison, gnugo, unix-smail |
+| Ptrdist | C | Pointer-Intensive Benchmark Suite | |
+| Rodinia | C | Scientific apps | backprop, pathfinder, srad |
+| SciMark2-C | C | Scientific apps | FFT, LU, Montecarlo, sparse matmul |
+| sim | C | Dynamic programming | A Time-Efficient, Linear-Space Local Similarity Algorithm |
+| tramp3d-v4 | C++ | Numerical analysis | Template-intensive numerical program based on FreePOOMA |
+| Trimaran | C | Encryption | 3des, md5, crc |
+| TSVC | C | Vectorization benchmark | Test Suite for Vectorizing Compilers (TSVC) |
+| VersaBench | C | Benchmark suite | 8b10b, beamformer, bmm, dbms, ecbdes |
+
+All MultiSource applications are suitable for performance measurements
+and will run when the CMake option `TEST_SUITE_BENCHMARKING_ONLY` is set.
Configuration ------------- From ece6566048086cf2870d2c2bff46384df1b9e531 Mon Sep 17 00:00:00 2001 From: Pradeep Kumar Date: Fri, 30 Aug 2024 20:08:13 +0530 Subject: [PATCH 40/98] [MLIR][NVVM] Add support for fence.proxy.{acquire, release} Ops (#106689) --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 79 ++++++++++++++++++- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 28 +++++++ .../Dialect/NVVM/NVVMToLLVMIRTranslation.cpp | 34 ++++++++ mlir/test/Target/LLVMIR/nvvmir-invalid.mlir | 33 ++++++++ mlir/test/Target/LLVMIR/nvvmir.mlir | 37 +++++++++ 5 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 mlir/test/Target/LLVMIR/nvvmir-invalid.mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 4d48b3de7a57ed..709dd922b8fa2f 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -19,6 +19,7 @@ include "mlir/Dialect/LLVMIR/LLVMOpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.td" +def LLVM_PointerGeneric : LLVM_PointerInAddressSpace<0>; def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>; def LLVM_PointerShared : LLVM_PointerInAddressSpace<3>; @@ -531,8 +532,10 @@ def ProxyAlias : I32EnumAttrCase<"alias", 0, "alias">; def ProxyAsync : I32EnumAttrCase<"async", 1, "async">; def ProxyAsyncGlobal : I32EnumAttrCase<"async_global", 2, "async.global">; def ProxyAsyncShared : I32EnumAttrCase<"async_shared", 3, "async.shared">; +def ProxyTensorMap : I32EnumAttrCase<"TENSORMAP", 4, "tensormap">; +def ProxyGeneric : I32EnumAttrCase<"GENERIC", 5, "generic">; def ProxyKind : I32EnumAttr<"ProxyKind", "Proxy kind", - [ProxyAlias, ProxyAsync, ProxyAsyncGlobal, ProxyAsyncShared]> { + [ProxyAlias, ProxyAsync, ProxyAsyncGlobal, ProxyAsyncShared, ProxyTensorMap, ProxyGeneric]> { let genSpecializedAttr = 0; let cppNamespace = "::mlir::NVVM"; } @@ -565,6 +568,80 @@ def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">, let hasVerifier = 1; } +// Attrs describing the scope of the Memory Operation +def MemScopeKindCTA : I32EnumAttrCase<"CTA", 0, "cta">; +def MemScopeKindCluster : I32EnumAttrCase<"CLUSTER", 1, "cluster">; +def MemScopeKindGPU : I32EnumAttrCase<"GPU", 2, "gpu">; +def MemScopeKindSYS : I32EnumAttrCase<"SYS", 3, "sys">; + +def MemScopeKind : I32EnumAttr<"MemScopeKind", "NVVM Memory Scope kind", + [MemScopeKindCTA, MemScopeKindCluster, MemScopeKindGPU, MemScopeKindSYS]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::NVVM"; +} +def MemScopeKindAttr : EnumAttr { + let assemblyFormat = "`<` $value `>`"; +} + +def NVVM_FenceProxyAcquireOp : NVVM_Op<"fence.proxy.acquire">, + Arguments<(ins MemScopeKindAttr:$scope, LLVM_PointerGeneric:$addr, I32:$size, + DefaultValuedAttr:$fromProxy, + DefaultValuedAttr:$toProxy)> { + let summary = "Uni-directional proxy fence operation with acquire semantics"; + let description = [{ + `fence.proxy.acquire` is a uni-directional fence used to establish ordering + between a prior memory access performed via the generic proxy and a + subsequent memory access performed via the tensormap proxy + + The address operand `addr` and the operand `size` together specify the + memory range `[addr, addr+size)` on which the ordering guarantees on the + memory accesses across the proxies is to be provided. The only supported + value for the `size` operand is 128 and must be an immediate. 
+    Generic Addressing is used unconditionally, and the address specified by
+    the operand `addr` must fall within the `.global` state space; otherwise,
+    the behavior is undefined.
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+  }];
+
+  let assemblyFormat = "$scope $addr `,` $size (`from_proxy` `=` $fromProxy^)? (`to_proxy` `=` $toProxy^)? attr-dict";
+  let llvmBuilder = [{
+    createIntrinsicCall(
+        builder,
+        getUnidirectionalFenceProxyID($fromProxy, $toProxy, $scope, false),
+        {$addr, $size});
+  }];
+
+  let hasVerifier = 1;
+}
+
+def NVVM_FenceProxyReleaseOp : NVVM_Op<"fence.proxy.release">,
+  Arguments<(ins MemScopeKindAttr:$scope,
+             DefaultValuedAttr<ProxyKindAttr, "ProxyKind::GENERIC">:$fromProxy,
+             DefaultValuedAttr<ProxyKindAttr, "ProxyKind::TENSORMAP">:$toProxy)> {
+  let summary = "Uni-directional proxy fence operation with release semantics";
+  let description = [{
+    `fence.proxy.release` is a uni-directional fence used to establish ordering
+    between a prior memory access performed via the generic proxy and a
+    subsequent memory access performed via the tensormap proxy. The
+    `fence.proxy.release` operation can form a release sequence that
+    synchronizes with an acquire sequence containing the `fence.proxy.acquire`
+    proxy fence operation.
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+  }];
+
+  let assemblyFormat = "$scope (`from_proxy` `=` $fromProxy^)? (`to_proxy` `=` $toProxy^)? attr-dict";
+  let llvmBuilder = [{
+    createIntrinsicCall(builder, getUnidirectionalFenceProxyID(
+        $fromProxy, $toProxy, $scope, true));
+  }];
+
+  let hasVerifier = 1;
+}
+
 def SetMaxRegisterActionIncrease : I32EnumAttrCase<"increase", 0>;
 def SetMaxRegisterActionDecrease : I32EnumAttrCase<"decrease", 1>;
 def SetMaxRegisterAction : I32EnumAttr<"SetMaxRegisterAction", "NVVM set max register action",
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 4d1896551101ed..2c7c3e9d535f7d 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1004,6 +1004,10 @@ void NVVM::WgmmaMmaAsyncOp::getAsmValues(
   }
 }
 LogicalResult NVVM::FenceProxyOp::verify() {
+  if (getKind() == NVVM::ProxyKind::TENSORMAP)
+    return emitOpError() << "tensormap proxy is not a supported proxy kind";
+  if (getKind() == NVVM::ProxyKind::GENERIC)
+    return emitOpError() << "generic proxy is not a supported proxy kind";
   if (getKind() == NVVM::ProxyKind::async_shared && !getSpace().has_value()) {
     return emitOpError() << "async_shared fence requires space attribute";
   }
@@ -1013,6 +1017,30 @@ LogicalResult NVVM::FenceProxyOp::verify() {
   return success();
 }
 
+LogicalResult NVVM::FenceProxyAcquireOp::verify() {
+  if (getFromProxy() != NVVM::ProxyKind::GENERIC)
+    return emitOpError("uni-directional proxies only support generic for "
+                       "from_proxy attribute");
+
+  if (getToProxy() != NVVM::ProxyKind::TENSORMAP)
+    return emitOpError("uni-directional proxies only support tensormap "
+                       "for to_proxy attribute");
+
+  return success();
+}
+
+LogicalResult NVVM::FenceProxyReleaseOp::verify() {
+  if (getFromProxy() != NVVM::ProxyKind::GENERIC)
+    return emitOpError("uni-directional proxies only support generic for "
+                       "from_proxy attribute");
+
+  if (getToProxy() != NVVM::ProxyKind::TENSORMAP)
+    return emitOpError("uni-directional proxies only support tensormap "
+                       "for to_proxy attribute");
+
+  return success();
+}
+
 LogicalResult NVVM::SetMaxRegisterOp::verify() {
   if (getRegCount() % 8)
     return emitOpError("new register size must be multiple of 8");
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
index a09c24dda82afc..f93e1cc8780c79 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
@@ -120,6 +120,40 @@ static llvm::Intrinsic::ID getLdMatrixIntrinsicId(NVVM::MMALayout layout,
   }
 }
 
+static unsigned getUnidirectionalFenceProxyID(NVVM::ProxyKind fromProxy,
+                                              NVVM::ProxyKind toProxy,
+                                              NVVM::MemScopeKind scope,
+                                              bool isRelease) {
+  if (fromProxy == NVVM::ProxyKind::GENERIC &&
+      toProxy == NVVM::ProxyKind::TENSORMAP) {
+    switch (scope) {
+    case NVVM::MemScopeKind::CTA: {
+      if (isRelease)
+        return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_cta;
+      return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_cta;
+    }
+    case NVVM::MemScopeKind::CLUSTER: {
+      if (isRelease)
+        return llvm::Intrinsic::
+            nvvm_fence_proxy_tensormap_generic_release_cluster;
+      return llvm::Intrinsic::
+          nvvm_fence_proxy_tensormap_generic_acquire_cluster;
+    }
+    case NVVM::MemScopeKind::GPU: {
+      if (isRelease)
+        return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_gpu;
+      return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_gpu;
+    }
+    case NVVM::MemScopeKind::SYS: {
+      if (isRelease)
+        return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_sys;
+      return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_sys;
+    }
+    }
+    llvm_unreachable("Unknown scope for uni-directional fence.proxy operation");
+  }
+  llvm_unreachable("Unsupported proxy kinds for fence.proxy operation");
+}
+
 namespace {
 /// Implementation of the dialect interface that converts operations belonging
 /// to the NVVM dialect to LLVM IR.
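Taken together, the two ops express the usual cross-proxy release/acquire
pattern. The following sketch is illustrative only (the function and value
names are hypothetical, and `from_proxy`/`to_proxy` are left at their
generic-to-tensormap defaults); it shows a thread rewriting a 128-byte
tensormap object through the generic proxy and fencing before the object is
consumed through the tensormap proxy:

  llvm.func @update_tensormap(%tmap : !llvm.ptr) {
    %c128 = llvm.mlir.constant(128) : i32
    // ... rewrite the 128-byte tensormap object via ordinary (generic-proxy)
    // stores ...

    // Release the generic-proxy writes at GPU scope; lowers to
    // @llvm.nvvm.fence.proxy.tensormap_generic.release.gpu.
    nvvm.fence.proxy.release #nvvm.mem_scope<gpu>

    // Order later tensormap-proxy reads of [%tmap, %tmap+128) after the
    // writes released above; lowers to
    // @llvm.nvvm.fence.proxy.tensormap_generic.acquire.gpu.
    nvvm.fence.proxy.acquire #nvvm.mem_scope<gpu> %tmap, %c128
    llvm.return
  }

In a real kernel the acquire would typically sit on the consuming side of the
synchronization; it is shown inline here only to keep the sketch
self-contained.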
diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
new file mode 100644
index 00000000000000..0e563808da970b
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
@@ -0,0 +1,33 @@
+// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s
+
+// -----
+
+llvm.func @nvvm_fence_proxy_acquire(%addr : !llvm.ptr, %size : i32) {
+  // expected-error @below {{'nvvm.fence.proxy.acquire' op uni-directional proxies only support generic for from_proxy attribute}}
+  nvvm.fence.proxy.acquire #nvvm.mem_scope %addr, %size from_proxy=#nvvm.proxy_kind to_proxy=#nvvm.proxy_kind
+  llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_fence_proxy_release() {
+  // expected-error @below {{'nvvm.fence.proxy.release' op uni-directional proxies only support generic for from_proxy attribute}}
+  nvvm.fence.proxy.release #nvvm.mem_scope from_proxy=#nvvm.proxy_kind to_proxy=#nvvm.proxy_kind
+  llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_fence_proxy_acquire(%addr : !llvm.ptr, %size : i32) {
+  // expected-error @below {{'nvvm.fence.proxy.acquire' op uni-directional proxies only support tensormap for to_proxy attribute}}
+  nvvm.fence.proxy.acquire #nvvm.mem_scope %addr, %size from_proxy=#nvvm.proxy_kind to_proxy=#nvvm.proxy_kind
+  llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_fence_proxy_release() {
+  // expected-error @below {{'nvvm.fence.proxy.release' op uni-directional proxies only support tensormap for to_proxy attribute}}
+  nvvm.fence.proxy.release #nvvm.mem_scope from_proxy=#nvvm.proxy_kind to_proxy=#nvvm.proxy_kind
+  llvm.return
+}
\ No newline at end of file
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index a8ae4d97888c90..6e2787d121ae64 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -574,3 +574,40 @@ llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant})
 llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant}, %arg1: f32, %arg2: !llvm.ptr {llvm.byval = f32, nvvm.grid_constant}) attributes {nvvm.kernel} {
   llvm.return
 }
+
+
+// -----
+// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_release
+llvm.func @nvvm_fence_proxy_tensormap_generic_release() {
+  %c128 = llvm.mlir.constant(128) : i32
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cta()
+  nvvm.fence.proxy.release #nvvm.mem_scope<cta>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cluster()
+  nvvm.fence.proxy.release #nvvm.mem_scope<cluster>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.gpu()
+  nvvm.fence.proxy.release #nvvm.mem_scope<gpu>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.sys()
+  nvvm.fence.proxy.release #nvvm.mem_scope<sys>
+  llvm.return
+}
+
+// -----
+// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_acquire
+llvm.func @nvvm_fence_proxy_tensormap_generic_acquire(%addr : !llvm.ptr) {
+  %c128 = llvm.mlir.constant(128) : i32
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr {{%[0-9]+}}, i32 128)
+  nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %c128
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cluster(ptr {{%[0-9]+}}, i32 128)
+  nvvm.fence.proxy.acquire #nvvm.mem_scope<cluster> %addr, %c128
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.gpu(ptr {{%[0-9]+}}, i32 128)
+  nvvm.fence.proxy.acquire #nvvm.mem_scope<gpu> %addr, %c128
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.sys(ptr {{%[0-9]+}}, i32 128)
+  nvvm.fence.proxy.acquire #nvvm.mem_scope<sys> %addr, %c128
+  llvm.return
+}
\ No newline at end of file

From fef3426ad3d8d5bf01941438467df318d00c6279 Mon Sep 17 00:00:00 2001
From: Chris Apple
Date: Fri, 30 Aug 2024 07:48:31 -0700
Subject: [PATCH 41/98] Revert "[LLVM][rtsan] Add LLVM nosanitize_realtime
 attribute (#105447)" (#106743)

This reverts commit 178fc4779ece31392a2cd01472b0279e50b3a199.

This attribute is no longer needed now that we are using the lsan-style
ScopedDisabler for disabling this sanitizer. See #106736 and #106125 for
more discussion.

---
 llvm/docs/LangRef.rst                       | 5 -----
 llvm/include/llvm/Bitcode/LLVMBitCodes.h    | 1 -
 llvm/include/llvm/IR/Attributes.td          | 3 ---
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp   | 2 --
 llvm/lib/Bitcode/Writer/BitcodeWriter.cpp   | 2 --
 llvm/lib/IR/Verifier.cpp                    | 6 ------
 llvm/lib/Transforms/Utils/CodeExtractor.cpp | 1 -
 llvm/test/Bitcode/attributes.ll             | 7 -------
 llvm/test/Bitcode/compatibility.ll          | 8 ++------
 llvm/test/Verifier/rtsan-attrs.ll           | 9 ---------
 10 files changed, 2 insertions(+), 42 deletions(-)
 delete mode 100644 llvm/test/Verifier/rtsan-attrs.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index cf0a6f96fb012e..c75b75edaf2ca0 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -2189,10 +2189,6 @@ example:
 ``nosanitize_coverage``
     This attribute indicates that SanitizerCoverage instrumentation is
     disabled for this function.
-``nosanitize_realtime``
-    This attribute indicates that the Realtime Sanitizer instrumentation is
-    disabled for this function.
-    This attribute is incompatible with the ``sanitize_realtime`` attribute.
 ``null_pointer_is_valid``
     If ``null_pointer_is_valid`` is set, then the ``null`` address in
     address-space 0 is considered to be a valid address for memory loads and
@@ -2319,7 +2315,6 @@ example:
     This attribute indicates that RealtimeSanitizer checks
     (realtime safety analysis - no allocations, syscalls or exceptions) are enabled
     for this function.
-    This attribute is incompatible with the ``nosanitize_realtime`` attribute.
 ``speculative_load_hardening``
     This attribute indicates that `Speculative Load Hardening <https://llvm.org/docs/SpeculativeLoadHardening.html>`_
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 8a2e6583af87c5..4beac37a583445 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -759,7 +759,6 @@ enum AttributeKindCodes {
   ATTR_KIND_INITIALIZES = 94,
   ATTR_KIND_HYBRID_PATCHABLE = 95,
   ATTR_KIND_SANITIZE_REALTIME = 96,
-  ATTR_KIND_NO_SANITIZE_REALTIME = 97,
 };
 
 enum ComdatSelectionKindCodes {
diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td
index 80936c0ee83355..891e34fec0c798 100644
--- a/llvm/include/llvm/IR/Attributes.td
+++ b/llvm/include/llvm/IR/Attributes.td
@@ -212,9 +212,6 @@ def NoSanitizeBounds : EnumAttr<"nosanitize_bounds", [FnAttr]>;
 /// No SanitizeCoverage instrumentation.
 def NoSanitizeCoverage : EnumAttr<"nosanitize_coverage", [FnAttr]>;
 
-/// No SanitizeRealtime instrumentation.
-def NoSanitizeRealtime : EnumAttr<"nosanitize_realtime", [FnAttr]>;
-
 /// Null pointer in address space zero is valid.
def NullPointerIsValid : EnumAttr<"null_pointer_is_valid", [FnAttr]>; diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 974a05023c72a5..654be985a3229c 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2093,8 +2093,6 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::NoSanitizeBounds; case bitc::ATTR_KIND_NO_SANITIZE_COVERAGE: return Attribute::NoSanitizeCoverage; - case bitc::ATTR_KIND_NO_SANITIZE_REALTIME: - return Attribute::NoSanitizeRealtime; case bitc::ATTR_KIND_NULL_POINTER_IS_VALID: return Attribute::NullPointerIsValid; case bitc::ATTR_KIND_OPTIMIZE_FOR_DEBUGGING: diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 3c5097f4af7c56..26fd02b3e1a043 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -795,8 +795,6 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_NO_SANITIZE_BOUNDS; case Attribute::NoSanitizeCoverage: return bitc::ATTR_KIND_NO_SANITIZE_COVERAGE; - case llvm::Attribute::NoSanitizeRealtime: - return bitc::ATTR_KIND_NO_SANITIZE_REALTIME; case Attribute::NullPointerIsValid: return bitc::ATTR_KIND_NULL_POINTER_IS_VALID; case Attribute::OptimizeForDebugging: diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 79b3ca3b6a5a7e..d8f3bab45b2a65 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -2223,12 +2223,6 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, "Attributes 'optdebug and optnone' are incompatible!", V); } - Check(!(Attrs.hasFnAttr(Attribute::SanitizeRealtime) && - Attrs.hasFnAttr(Attribute::NoSanitizeRealtime)), - "Attributes " - "'sanitize_realtime and nosanitize_realtime' are incompatible!", - V); - if (Attrs.hasFnAttr(Attribute::OptimizeForDebugging)) { Check(!Attrs.hasFnAttr(Attribute::OptimizeForSize), "Attributes 'optsize and optdebug' are incompatible!", V); diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index cf00299812bb7f..d378c6c3a4b01c 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -937,7 +937,6 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, case Attribute::NoUnwind: case Attribute::NoSanitizeBounds: case Attribute::NoSanitizeCoverage: - case Attribute::NoSanitizeRealtime: case Attribute::NullPointerIsValid: case Attribute::OptimizeForDebugging: case Attribute::OptForFuzzing: diff --git a/llvm/test/Bitcode/attributes.ll b/llvm/test/Bitcode/attributes.ll index 835622276ef279..4402289ac170d9 100644 --- a/llvm/test/Bitcode/attributes.ll +++ b/llvm/test/Bitcode/attributes.ll @@ -511,12 +511,6 @@ define void @f92() sanitize_realtime ret void; } -; CHECK: define void @f93() #54 -define void @f93() nosanitize_realtime -{ - ret void; -} - ; CHECK: define void @f87() [[FNRETTHUNKEXTERN:#[0-9]+]] define void @f87() fn_ret_thunk_extern { ret void } @@ -612,7 +606,6 @@ define void @initializes(ptr initializes((-4, 0), (4, 8)) %a) { ; CHECK: attributes #51 = { uwtable(sync) } ; CHECK: attributes #52 = { nosanitize_bounds } ; CHECK: attributes #53 = { sanitize_realtime } -; CHECK: attributes #54 = { nosanitize_realtime } ; CHECK: attributes [[FNRETTHUNKEXTERN]] = { fn_ret_thunk_extern } ; CHECK: attributes [[SKIPPROFILE]] = { skipprofile } ; CHECK: attributes [[OPTDEBUG]] = { 
optdebug } diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll index c401cde8e146e7..fd60c49a4be39b 100644 --- a/llvm/test/Bitcode/compatibility.ll +++ b/llvm/test/Bitcode/compatibility.ll @@ -1562,7 +1562,7 @@ exit: ; CHECK: select <2 x i1> , <2 x i8> , <2 x i8> call void @f.nobuiltin() builtin - ; CHECK: call void @f.nobuiltin() #54 + ; CHECK: call void @f.nobuiltin() #53 call fastcc noalias ptr @f.noalias() noinline ; CHECK: call fastcc noalias ptr @f.noalias() #12 @@ -1992,9 +1992,6 @@ declare void @f.sanitize_numerical_stability() sanitize_numerical_stability declare void @f.sanitize_realtime() sanitize_realtime ; CHECK: declare void @f.sanitize_realtime() #52 -declare void @f.nosanitize_realtime() nosanitize_realtime -; CHECK: declare void @f.nosanitize_realtime() #53 - ; CHECK: declare nofpclass(snan) float @nofpclass_snan(float nofpclass(snan)) declare nofpclass(snan) float @nofpclass_snan(float nofpclass(snan)) @@ -2118,8 +2115,7 @@ define float @nofpclass_callsites(float %arg) { ; CHECK: attributes #50 = { allockind("alloc,uninitialized") } ; CHECK: attributes #51 = { sanitize_numerical_stability } ; CHECK: attributes #52 = { sanitize_realtime } -; CHECK: attributes #53 = { nosanitize_realtime } -; CHECK: attributes #54 = { builtin } +; CHECK: attributes #53 = { builtin } ;; Metadata diff --git a/llvm/test/Verifier/rtsan-attrs.ll b/llvm/test/Verifier/rtsan-attrs.ll deleted file mode 100644 index 42ab85163642b1..00000000000000 --- a/llvm/test/Verifier/rtsan-attrs.ll +++ /dev/null @@ -1,9 +0,0 @@ -; RUN: not llvm-as -disable-output %s 2>&1 | FileCheck %s - -; CHECK: Attributes 'sanitize_realtime and nosanitize_realtime' are incompatible! -; CHECK-NEXT: ptr @sanitize_nosanitize -define void @sanitize_nosanitize() #0 { - ret void -} - -attributes #0 = { sanitize_realtime nosanitize_realtime } From 82a5ab756fdbce432794c00bdeeb95aa7e403d3f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 30 Aug 2024 15:28:59 +0100 Subject: [PATCH 42/98] [X86] x86-vperm.ll - strip superfluous semicolon check lines. NFC. 
--- .../Transforms/InstCombine/X86/x86-vperm.ll | 112 ------------------ 1 file changed, 112 deletions(-) diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll b/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll index a0e2d3d6fe9fbe..eaa1653f6505d4 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll @@ -4,7 +4,6 @@ declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) define <8 x i32> @identity_test_permvar_si_256(<8 x i32> %a0) { -; ; CHECK-LABEL: @identity_test_permvar_si_256( ; CHECK-NEXT: ret <8 x i32> [[A0:%.*]] ; @@ -13,7 +12,6 @@ define <8 x i32> @identity_test_permvar_si_256(<8 x i32> %a0) { } define <8 x i32> @identity_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { -; ; CHECK-LABEL: @identity_test_permvar_si_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> [[A0:%.*]], <8 x i32> [[PASSTHRU:%.*]] @@ -26,7 +24,6 @@ define <8 x i32> @identity_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %pa } define <8 x i32> @zero_test_permvar_si_256(<8 x i32> %a0) { -; ; CHECK-LABEL: @zero_test_permvar_si_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x i32> [[TMP1]] @@ -36,7 +33,6 @@ define <8 x i32> @zero_test_permvar_si_256(<8 x i32> %a0) { } define <8 x i32> @zero_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { -; ; CHECK-LABEL: @zero_test_permvar_si_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -50,7 +46,6 @@ define <8 x i32> @zero_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passth } define <8 x i32> @shuffle_test_permvar_si_256(<8 x i32> %a0) { -; ; CHECK-LABEL: @shuffle_test_permvar_si_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] @@ -60,7 +55,6 @@ define <8 x i32> @shuffle_test_permvar_si_256(<8 x i32> %a0) { } define <8 x i32> @shuffle_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { -; ; CHECK-LABEL: @shuffle_test_permvar_si_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -74,7 +68,6 @@ define <8 x i32> @shuffle_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %pas } define <8 x i32> @undef_test_permvar_si_256(<8 x i32> %a0) { -; ; CHECK-LABEL: @undef_test_permvar_si_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] @@ -84,7 +77,6 @@ define <8 x i32> @undef_test_permvar_si_256(<8 x i32> %a0) { } define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { -; ; CHECK-LABEL: @undef_test_permvar_si_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -100,7 +92,6 @@ define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passt declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) define <8 x float> @identity_test_permvar_sf_256(<8 x float> %a0) { -; ; CHECK-LABEL: @identity_test_permvar_sf_256( ; CHECK-NEXT: ret <8 x float> [[A0:%.*]] ; 
@@ -109,7 +100,6 @@ define <8 x float> @identity_test_permvar_sf_256(<8 x float> %a0) { } define <8 x float> @identity_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { -; ; CHECK-LABEL: @identity_test_permvar_sf_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[A0:%.*]], <8 x float> [[PASSTHRU:%.*]] @@ -122,7 +112,6 @@ define <8 x float> @identity_test_permvar_sf_256_mask(<8 x float> %a0, <8 x floa } define <8 x float> @zero_test_permvar_sf_256(<8 x float> %a0) { -; ; CHECK-LABEL: @zero_test_permvar_sf_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x float> [[TMP1]] @@ -132,7 +121,6 @@ define <8 x float> @zero_test_permvar_sf_256(<8 x float> %a0) { } define <8 x float> @zero_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { -; ; CHECK-LABEL: @zero_test_permvar_sf_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -146,7 +134,6 @@ define <8 x float> @zero_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> % } define <8 x float> @shuffle_test_permvar_sf_256(<8 x float> %a0) { -; ; CHECK-LABEL: @shuffle_test_permvar_sf_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> ; CHECK-NEXT: ret <8 x float> [[TMP1]] @@ -156,7 +143,6 @@ define <8 x float> @shuffle_test_permvar_sf_256(<8 x float> %a0) { } define <8 x float> @shuffle_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { -; ; CHECK-LABEL: @shuffle_test_permvar_sf_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -170,7 +156,6 @@ define <8 x float> @shuffle_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float } define <8 x float> @undef_test_permvar_sf_256(<8 x float> %a0) { -; ; CHECK-LABEL: @undef_test_permvar_sf_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> ; CHECK-NEXT: ret <8 x float> [[TMP1]] @@ -180,7 +165,6 @@ define <8 x float> @undef_test_permvar_sf_256(<8 x float> %a0) { } define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { -; ; CHECK-LABEL: @undef_test_permvar_sf_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -196,7 +180,6 @@ define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>) define <4 x i64> @identity_test_permvar_di_256(<4 x i64> %a0) { -; ; CHECK-LABEL: @identity_test_permvar_di_256( ; CHECK-NEXT: ret <4 x i64> [[A0:%.*]] ; @@ -205,7 +188,6 @@ define <4 x i64> @identity_test_permvar_di_256(<4 x i64> %a0) { } define <4 x i64> @identity_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { -; ; CHECK-LABEL: @identity_test_permvar_di_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32> @@ -220,7 +202,6 @@ define <4 x i64> @identity_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %pa } define <4 
x i64> @zero_test_permvar_di_256(<4 x i64> %a0) { -; ; CHECK-LABEL: @zero_test_permvar_di_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: ret <4 x i64> [[TMP1]] @@ -230,7 +211,6 @@ define <4 x i64> @zero_test_permvar_di_256(<4 x i64> %a0) { } define <4 x i64> @zero_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { -; ; CHECK-LABEL: @zero_test_permvar_di_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -246,7 +226,6 @@ define <4 x i64> @zero_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passth } define <4 x i64> @shuffle_test_permvar_di_256(<4 x i64> %a0) { -; ; CHECK-LABEL: @shuffle_test_permvar_di_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i64> [[TMP1]] @@ -256,7 +235,6 @@ define <4 x i64> @shuffle_test_permvar_di_256(<4 x i64> %a0) { } define <4 x i64> @shuffle_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { -; ; CHECK-LABEL: @shuffle_test_permvar_di_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -272,7 +250,6 @@ define <4 x i64> @shuffle_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %pas } define <4 x i64> @undef_test_permvar_di_256(<4 x i64> %a0) { -; ; CHECK-LABEL: @undef_test_permvar_di_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i64> [[TMP1]] @@ -282,7 +259,6 @@ define <4 x i64> @undef_test_permvar_di_256(<4 x i64> %a0) { } define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { -; ; CHECK-LABEL: @undef_test_permvar_di_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -300,7 +276,6 @@ define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passt declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>) define <4 x double> @identity_test_permvar_df_256(<4 x double> %a0) { -; ; CHECK-LABEL: @identity_test_permvar_df_256( ; CHECK-NEXT: ret <4 x double> [[A0:%.*]] ; @@ -309,7 +284,6 @@ define <4 x double> @identity_test_permvar_df_256(<4 x double> %a0) { } define <4 x double> @identity_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { -; ; CHECK-LABEL: @identity_test_permvar_df_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32> @@ -324,7 +298,6 @@ define <4 x double> @identity_test_permvar_df_256_mask(<4 x double> %a0, <4 x do } define <4 x double> @zero_test_permvar_df_256(<4 x double> %a0) { -; ; CHECK-LABEL: @zero_test_permvar_df_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: ret <4 x double> [[TMP1]] @@ -334,7 +307,6 @@ define <4 x double> @zero_test_permvar_df_256(<4 x double> %a0) { } define <4 x double> @zero_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { -; ; CHECK-LABEL: @zero_test_permvar_df_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x 
double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -350,7 +322,6 @@ define <4 x double> @zero_test_permvar_df_256_mask(<4 x double> %a0, <4 x double } define <4 x double> @shuffle_test_permvar_df_256(<4 x double> %a0) { -; ; CHECK-LABEL: @shuffle_test_permvar_df_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: ret <4 x double> [[TMP1]] @@ -360,7 +331,6 @@ define <4 x double> @shuffle_test_permvar_df_256(<4 x double> %a0) { } define <4 x double> @shuffle_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { -; ; CHECK-LABEL: @shuffle_test_permvar_df_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -376,7 +346,6 @@ define <4 x double> @shuffle_test_permvar_df_256_mask(<4 x double> %a0, <4 x dou } define <4 x double> @undef_test_permvar_df_256(<4 x double> %a0) { -; ; CHECK-LABEL: @undef_test_permvar_df_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: ret <4 x double> [[TMP1]] @@ -386,7 +355,6 @@ define <4 x double> @undef_test_permvar_df_256(<4 x double> %a0) { } define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { -; ; CHECK-LABEL: @undef_test_permvar_df_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -404,7 +372,6 @@ define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x doubl declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) define <16 x i32> @identity_test_permvar_si_512(<16 x i32> %a0) { -; ; CHECK-LABEL: @identity_test_permvar_si_512( ; CHECK-NEXT: ret <16 x i32> [[A0:%.*]] ; @@ -413,7 +380,6 @@ define <16 x i32> @identity_test_permvar_si_512(<16 x i32> %a0) { } define <16 x i32> @identity_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { -; ; CHECK-LABEL: @identity_test_permvar_si_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i32> [[A0:%.*]], <16 x i32> [[PASSTHRU:%.*]] @@ -426,7 +392,6 @@ define <16 x i32> @identity_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> } define <16 x i32> @zero_test_permvar_si_512(<16 x i32> %a0) { -; ; CHECK-LABEL: @zero_test_permvar_si_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: ret <16 x i32> [[TMP1]] @@ -436,7 +401,6 @@ define <16 x i32> @zero_test_permvar_si_512(<16 x i32> %a0) { } define <16 x i32> @zero_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { -; ; CHECK-LABEL: @zero_test_permvar_si_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -450,7 +414,6 @@ define <16 x i32> @zero_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %pas } define <16 x i32> @shuffle_test_permvar_si_512(<16 x i32> %a0) { -; ; CHECK-LABEL: @shuffle_test_permvar_si_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i32> [[TMP1]] @@ -460,7 +423,6 @@ define <16 x 
i32> @shuffle_test_permvar_si_512(<16 x i32> %a0) { } define <16 x i32> @shuffle_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { -; ; CHECK-LABEL: @shuffle_test_permvar_si_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -474,7 +436,6 @@ define <16 x i32> @shuffle_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> % } define <16 x i32> @undef_test_permvar_si_512(<16 x i32> %a0) { -; ; CHECK-LABEL: @undef_test_permvar_si_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i32> [[TMP1]] @@ -484,7 +445,6 @@ define <16 x i32> @undef_test_permvar_si_512(<16 x i32> %a0) { } define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { -; ; CHECK-LABEL: @undef_test_permvar_si_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -500,7 +460,6 @@ define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %pa declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>) define <16 x float> @identity_test_permvar_sf_512(<16 x float> %a0) { -; ; CHECK-LABEL: @identity_test_permvar_sf_512( ; CHECK-NEXT: ret <16 x float> [[A0:%.*]] ; @@ -509,7 +468,6 @@ define <16 x float> @identity_test_permvar_sf_512(<16 x float> %a0) { } define <16 x float> @identity_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { -; ; CHECK-LABEL: @identity_test_permvar_sf_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x float> [[A0:%.*]], <16 x float> [[PASSTHRU:%.*]] @@ -522,7 +480,6 @@ define <16 x float> @identity_test_permvar_sf_512_mask(<16 x float> %a0, <16 x f } define <16 x float> @zero_test_permvar_sf_512(<16 x float> %a0) { -; ; CHECK-LABEL: @zero_test_permvar_sf_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -532,7 +489,6 @@ define <16 x float> @zero_test_permvar_sf_512(<16 x float> %a0) { } define <16 x float> @zero_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { -; ; CHECK-LABEL: @zero_test_permvar_sf_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -546,7 +502,6 @@ define <16 x float> @zero_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float } define <16 x float> @shuffle_test_permvar_sf_512(<16 x float> %a0) { -; ; CHECK-LABEL: @shuffle_test_permvar_sf_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -556,7 +511,6 @@ define <16 x float> @shuffle_test_permvar_sf_512(<16 x float> %a0) { } define <16 x float> @shuffle_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { -; ; CHECK-LABEL: @shuffle_test_permvar_sf_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -570,7 +524,6 @@ define <16 x float> @shuffle_test_permvar_sf_512_mask(<16 x 
float> %a0, <16 x fl } define <16 x float> @undef_test_permvar_sf_512(<16 x float> %a0) { -; ; CHECK-LABEL: @undef_test_permvar_sf_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -580,7 +533,6 @@ define <16 x float> @undef_test_permvar_sf_512(<16 x float> %a0) { } define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { -; ; CHECK-LABEL: @undef_test_permvar_sf_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -596,7 +548,6 @@ define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x floa declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) define <8 x i64> @identity_test_permvar_di_512(<8 x i64> %a0) { -; ; CHECK-LABEL: @identity_test_permvar_di_512( ; CHECK-NEXT: ret <8 x i64> [[A0:%.*]] ; @@ -605,7 +556,6 @@ define <8 x i64> @identity_test_permvar_di_512(<8 x i64> %a0) { } define <8 x i64> @identity_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { -; ; CHECK-LABEL: @identity_test_permvar_di_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> [[A0:%.*]], <8 x i64> [[PASSTHRU:%.*]] @@ -618,7 +568,6 @@ define <8 x i64> @identity_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %pa } define <8 x i64> @zero_test_permvar_di_512(<8 x i64> %a0) { -; ; CHECK-LABEL: @zero_test_permvar_di_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x i64> [[TMP1]] @@ -628,7 +577,6 @@ define <8 x i64> @zero_test_permvar_di_512(<8 x i64> %a0) { } define <8 x i64> @zero_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { -; ; CHECK-LABEL: @zero_test_permvar_di_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -642,7 +590,6 @@ define <8 x i64> @zero_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passth } define <8 x i64> @shuffle_test_permvar_di_512(<8 x i64> %a0) { -; ; CHECK-LABEL: @shuffle_test_permvar_di_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i64> [[TMP1]] @@ -652,7 +599,6 @@ define <8 x i64> @shuffle_test_permvar_di_512(<8 x i64> %a0) { } define <8 x i64> @shuffle_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { -; ; CHECK-LABEL: @shuffle_test_permvar_di_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -666,7 +612,6 @@ define <8 x i64> @shuffle_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %pas } define <8 x i64> @undef_test_permvar_di_512(<8 x i64> %a0) { -; ; CHECK-LABEL: @undef_test_permvar_di_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i64> [[TMP1]] @@ -676,7 +621,6 @@ define <8 x i64> @undef_test_permvar_di_512(<8 x i64> %a0) { } define <8 x i64> @undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { -; ; CHECK-LABEL: @undef_test_permvar_di_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> 
[[A0:%.*]], <8 x i64> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -692,7 +636,6 @@ define <8 x i64> @undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passt declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>) define <8 x double> @identity_test_permvar_df_512(<8 x double> %a0) { -; ; CHECK-LABEL: @identity_test_permvar_df_512( ; CHECK-NEXT: ret <8 x double> [[A0:%.*]] ; @@ -701,7 +644,6 @@ define <8 x double> @identity_test_permvar_df_512(<8 x double> %a0) { } define <8 x double> @identity_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { -; ; CHECK-LABEL: @identity_test_permvar_df_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x double> [[A0:%.*]], <8 x double> [[PASSTHRU:%.*]] @@ -714,7 +656,6 @@ define <8 x double> @identity_test_permvar_df_512_mask(<8 x double> %a0, <8 x do } define <8 x double> @zero_test_permvar_df_512(<8 x double> %a0) { -; ; CHECK-LABEL: @zero_test_permvar_df_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -724,7 +665,6 @@ define <8 x double> @zero_test_permvar_df_512(<8 x double> %a0) { } define <8 x double> @zero_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { -; ; CHECK-LABEL: @zero_test_permvar_df_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -738,7 +678,6 @@ define <8 x double> @zero_test_permvar_df_512_mask(<8 x double> %a0, <8 x double } define <8 x double> @shuffle_test_permvar_df_512(<8 x double> %a0) { -; ; CHECK-LABEL: @shuffle_test_permvar_df_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -748,7 +687,6 @@ define <8 x double> @shuffle_test_permvar_df_512(<8 x double> %a0) { } define <8 x double> @shuffle_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { -; ; CHECK-LABEL: @shuffle_test_permvar_df_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -762,7 +700,6 @@ define <8 x double> @shuffle_test_permvar_df_512_mask(<8 x double> %a0, <8 x dou } define <8 x double> @undef_test_permvar_df_512(<8 x double> %a0) { -; ; CHECK-LABEL: @undef_test_permvar_df_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -772,7 +709,6 @@ define <8 x double> @undef_test_permvar_df_512(<8 x double> %a0) { } define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { -; ; CHECK-LABEL: @undef_test_permvar_df_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -788,7 +724,6 @@ define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x doubl declare <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16>, <8 x i16>) define <8 x i16> @identity_test_permvar_hi_128(<8 x i16> %a0) { -; ; CHECK-LABEL: @identity_test_permvar_hi_128( ; CHECK-NEXT: ret <8 x i16> [[A0:%.*]] ; @@ -797,7 +732,6 
@@ define <8 x i16> @identity_test_permvar_hi_128(<8 x i16> %a0) { } define <8 x i16> @identity_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { -; ; CHECK-LABEL: @identity_test_permvar_hi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A0:%.*]], <8 x i16> [[PASSTHRU:%.*]] @@ -810,7 +744,6 @@ define <8 x i16> @identity_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %pa } define <8 x i16> @zero_test_permvar_hi_128(<8 x i16> %a0) { -; ; CHECK-LABEL: @zero_test_permvar_hi_128( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x i16> [[TMP1]] @@ -820,7 +753,6 @@ define <8 x i16> @zero_test_permvar_hi_128(<8 x i16> %a0) { } define <8 x i16> @zero_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { -; ; CHECK-LABEL: @zero_test_permvar_hi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -834,7 +766,6 @@ define <8 x i16> @zero_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passth } define <8 x i16> @shuffle_test_permvar_hi_128(<8 x i16> %a0) { -; ; CHECK-LABEL: @shuffle_test_permvar_hi_128( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i16> [[TMP1]] @@ -844,7 +775,6 @@ define <8 x i16> @shuffle_test_permvar_hi_128(<8 x i16> %a0) { } define <8 x i16> @shuffle_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { -; ; CHECK-LABEL: @shuffle_test_permvar_hi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -858,7 +788,6 @@ define <8 x i16> @shuffle_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %pas } define <8 x i16> @undef_test_permvar_hi_128(<8 x i16> %a0) { -; ; CHECK-LABEL: @undef_test_permvar_hi_128( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i16> [[TMP1]] @@ -868,7 +797,6 @@ define <8 x i16> @undef_test_permvar_hi_128(<8 x i16> %a0) { } define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { -; ; CHECK-LABEL: @undef_test_permvar_hi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -884,7 +812,6 @@ define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passt declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>) define <16 x i16> @identity_test_permvar_hi_256(<16 x i16> %a0) { -; ; CHECK-LABEL: @identity_test_permvar_hi_256( ; CHECK-NEXT: ret <16 x i16> [[A0:%.*]] ; @@ -893,7 +820,6 @@ define <16 x i16> @identity_test_permvar_hi_256(<16 x i16> %a0) { } define <16 x i16> @identity_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { -; ; CHECK-LABEL: @identity_test_permvar_hi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i16> [[A0:%.*]], <16 x i16> [[PASSTHRU:%.*]] @@ -906,7 +832,6 @@ define <16 x i16> @identity_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> } define <16 x i16> @zero_test_permvar_hi_256(<16 x i16> %a0) { -; ; CHECK-LABEL: 
@zero_test_permvar_hi_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: ret <16 x i16> [[TMP1]] @@ -916,7 +841,6 @@ define <16 x i16> @zero_test_permvar_hi_256(<16 x i16> %a0) { } define <16 x i16> @zero_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { -; ; CHECK-LABEL: @zero_test_permvar_hi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -930,7 +854,6 @@ define <16 x i16> @zero_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %pas } define <16 x i16> @shuffle_test_permvar_hi_256(<16 x i16> %a0) { -; ; CHECK-LABEL: @shuffle_test_permvar_hi_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i16> [[TMP1]] @@ -940,7 +863,6 @@ define <16 x i16> @shuffle_test_permvar_hi_256(<16 x i16> %a0) { } define <16 x i16> @shuffle_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { -; ; CHECK-LABEL: @shuffle_test_permvar_hi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -954,7 +876,6 @@ define <16 x i16> @shuffle_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> % } define <16 x i16> @undef_test_permvar_hi_256(<16 x i16> %a0) { -; ; CHECK-LABEL: @undef_test_permvar_hi_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i16> [[TMP1]] @@ -964,7 +885,6 @@ define <16 x i16> @undef_test_permvar_hi_256(<16 x i16> %a0) { } define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { -; ; CHECK-LABEL: @undef_test_permvar_hi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -980,7 +900,6 @@ define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %pa declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) define <32 x i16> @identity_test_permvar_hi_512(<32 x i16> %a0) { -; ; CHECK-LABEL: @identity_test_permvar_hi_512( ; CHECK-NEXT: ret <32 x i16> [[A0:%.*]] ; @@ -989,7 +908,6 @@ define <32 x i16> @identity_test_permvar_hi_512(<32 x i16> %a0) { } define <32 x i16> @identity_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { -; ; CHECK-LABEL: @identity_test_permvar_hi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i16> [[A0:%.*]], <32 x i16> [[PASSTHRU:%.*]] @@ -1002,7 +920,6 @@ define <32 x i16> @identity_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> } define <32 x i16> @zero_test_permvar_hi_512(<32 x i16> %a0) { -; ; CHECK-LABEL: @zero_test_permvar_hi_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: ret <32 x i16> [[TMP1]] @@ -1012,7 +929,6 @@ define <32 x i16> @zero_test_permvar_hi_512(<32 x i16> %a0) { } define <32 x i16> @zero_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { -; ; CHECK-LABEL: @zero_test_permvar_hi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer ; 
CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> @@ -1026,7 +942,6 @@ define <32 x i16> @zero_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %pas } define <32 x i16> @shuffle_test_permvar_hi_512(<32 x i16> %a0) { -; ; CHECK-LABEL: @shuffle_test_permvar_hi_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> ; CHECK-NEXT: ret <32 x i16> [[TMP1]] @@ -1036,7 +951,6 @@ define <32 x i16> @shuffle_test_permvar_hi_512(<32 x i16> %a0) { } define <32 x i16> @shuffle_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { -; ; CHECK-LABEL: @shuffle_test_permvar_hi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> @@ -1050,7 +964,6 @@ define <32 x i16> @shuffle_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> % } define <32 x i16> @undef_test_permvar_hi_512(<32 x i16> %a0) { -; ; CHECK-LABEL: @undef_test_permvar_hi_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> ; CHECK-NEXT: ret <32 x i16> [[TMP1]] @@ -1060,7 +973,6 @@ define <32 x i16> @undef_test_permvar_hi_512(<32 x i16> %a0) { } define <32 x i16> @undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { -; ; CHECK-LABEL: @undef_test_permvar_hi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> @@ -1076,7 +988,6 @@ define <32 x i16> @undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %pa declare <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8>, <16 x i8>) define <16 x i8> @identity_test_permvar_qi_128(<16 x i8> %a0) { -; ; CHECK-LABEL: @identity_test_permvar_qi_128( ; CHECK-NEXT: ret <16 x i8> [[A0:%.*]] ; @@ -1085,7 +996,6 @@ define <16 x i8> @identity_test_permvar_qi_128(<16 x i8> %a0) { } define <16 x i8> @identity_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { -; ; CHECK-LABEL: @identity_test_permvar_qi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A0:%.*]], <16 x i8> [[PASSTHRU:%.*]] @@ -1098,7 +1008,6 @@ define <16 x i8> @identity_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %pa } define <16 x i8> @zero_test_permvar_qi_128(<16 x i8> %a0) { -; ; CHECK-LABEL: @zero_test_permvar_qi_128( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: ret <16 x i8> [[TMP1]] @@ -1108,7 +1017,6 @@ define <16 x i8> @zero_test_permvar_qi_128(<16 x i8> %a0) { } define <16 x i8> @zero_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { -; ; CHECK-LABEL: @zero_test_permvar_qi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -1122,7 +1030,6 @@ define <16 x i8> @zero_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passth } define <16 x i8> @shuffle_test_permvar_qi_128(<16 x i8> %a0) { -; ; CHECK-LABEL: @shuffle_test_permvar_qi_128( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i8> [[TMP1]] @@ -1132,7 +1039,6 @@ define <16 x i8> @shuffle_test_permvar_qi_128(<16 x i8> %a0) { } define <16 x i8> 
@shuffle_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { -; ; CHECK-LABEL: @shuffle_test_permvar_qi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -1146,7 +1052,6 @@ define <16 x i8> @shuffle_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %pas } define <16 x i8> @undef_test_permvar_qi_128(<16 x i8> %a0) { -; ; CHECK-LABEL: @undef_test_permvar_qi_128( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i8> [[TMP1]] @@ -1156,7 +1061,6 @@ define <16 x i8> @undef_test_permvar_qi_128(<16 x i8> %a0) { } define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { -; ; CHECK-LABEL: @undef_test_permvar_qi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -1172,7 +1076,6 @@ define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passt declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>) define <32 x i8> @identity_test_permvar_qi_256(<32 x i8> %a0) { -; ; CHECK-LABEL: @identity_test_permvar_qi_256( ; CHECK-NEXT: ret <32 x i8> [[A0:%.*]] ; @@ -1181,7 +1084,6 @@ define <32 x i8> @identity_test_permvar_qi_256(<32 x i8> %a0) { } define <32 x i8> @identity_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { -; ; CHECK-LABEL: @identity_test_permvar_qi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i8> [[A0:%.*]], <32 x i8> [[PASSTHRU:%.*]] @@ -1194,7 +1096,6 @@ define <32 x i8> @identity_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %pa } define <32 x i8> @zero_test_permvar_qi_256(<32 x i8> %a0) { -; ; CHECK-LABEL: @zero_test_permvar_qi_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: ret <32 x i8> [[TMP1]] @@ -1204,7 +1105,6 @@ define <32 x i8> @zero_test_permvar_qi_256(<32 x i8> %a0) { } define <32 x i8> @zero_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { -; ; CHECK-LABEL: @zero_test_permvar_qi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> @@ -1218,7 +1118,6 @@ define <32 x i8> @zero_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passth } define <32 x i8> @shuffle_test_permvar_qi_256(<32 x i8> %a0) { -; ; CHECK-LABEL: @shuffle_test_permvar_qi_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> ; CHECK-NEXT: ret <32 x i8> [[TMP1]] @@ -1228,7 +1127,6 @@ define <32 x i8> @shuffle_test_permvar_qi_256(<32 x i8> %a0) { } define <32 x i8> @shuffle_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { -; ; CHECK-LABEL: @shuffle_test_permvar_qi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> @@ -1242,7 +1140,6 @@ define <32 x i8> @shuffle_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %pas } define <32 x i8> @undef_test_permvar_qi_256(<32 x i8> %a0) { -; ; CHECK-LABEL: @undef_test_permvar_qi_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x 
i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> ; CHECK-NEXT: ret <32 x i8> [[TMP1]] @@ -1252,7 +1149,6 @@ define <32 x i8> @undef_test_permvar_qi_256(<32 x i8> %a0) { } define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { -; ; CHECK-LABEL: @undef_test_permvar_qi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> @@ -1268,7 +1164,6 @@ define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passt declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) define <64 x i8> @identity_test_permvar_qi_512(<64 x i8> %a0) { -; ; CHECK-LABEL: @identity_test_permvar_qi_512( ; CHECK-NEXT: ret <64 x i8> [[A0:%.*]] ; @@ -1277,7 +1172,6 @@ define <64 x i8> @identity_test_permvar_qi_512(<64 x i8> %a0) { } define <64 x i8> @identity_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { -; ; CHECK-LABEL: @identity_test_permvar_qi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <64 x i1> [[TMP1]], <64 x i8> [[A0:%.*]], <64 x i8> [[PASSTHRU:%.*]] @@ -1290,7 +1184,6 @@ define <64 x i8> @identity_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %pa } define <64 x i8> @zero_test_permvar_qi_512(<64 x i8> %a0) { -; ; CHECK-LABEL: @zero_test_permvar_qi_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer ; CHECK-NEXT: ret <64 x i8> [[TMP1]] @@ -1300,7 +1193,6 @@ define <64 x i8> @zero_test_permvar_qi_512(<64 x i8> %a0) { } define <64 x i8> @zero_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { -; ; CHECK-LABEL: @zero_test_permvar_qi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> @@ -1314,7 +1206,6 @@ define <64 x i8> @zero_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passth } define <64 x i8> @shuffle_test_permvar_qi_512(<64 x i8> %a0) { -; ; CHECK-LABEL: @shuffle_test_permvar_qi_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> ; CHECK-NEXT: ret <64 x i8> [[TMP1]] @@ -1324,7 +1215,6 @@ define <64 x i8> @shuffle_test_permvar_qi_512(<64 x i8> %a0) { } define <64 x i8> @shuffle_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { -; ; CHECK-LABEL: @shuffle_test_permvar_qi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> @@ -1338,7 +1228,6 @@ define <64 x i8> @shuffle_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %pas } define <64 x i8> @undef_test_permvar_qi_512(<64 x i8> %a0) { -; ; CHECK-LABEL: @undef_test_permvar_qi_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> ; CHECK-NEXT: ret <64 x i8> [[TMP1]] @@ -1348,7 +1237,6 @@ define <64 x i8> @undef_test_permvar_qi_512(<64 x i8> %a0) { } define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { -; ; CHECK-LABEL: @undef_test_permvar_qi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> From d01e0f7fb10ff1f9e2b797ce8437c701dfd58cbc Mon Sep 17 00:00:00 2001 From: Simon 
Pilgrim Date: Fri, 30 Aug 2024 15:48:45 +0100 Subject: [PATCH 43/98] [InstCombine][X86] Add vpermv/vpermv3 test coverage for #106413 --- .../Transforms/InstCombine/X86/x86-vperm.ll | 154 +++++++++++++++++ .../Transforms/InstCombine/X86/x86-vpermi2.ll | 156 ++++++++++++++++++ 2 files changed, 310 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll b/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll index eaa1653f6505d4..6519e4f5348484 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll @@ -89,6 +89,17 @@ define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passt ret <8 x i32> %3 } +define <8 x i32> @demandedbit_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %a1) { +; CHECK-LABEL: @demandedbit_test_permvar_si_256_mask( +; CHECK-NEXT: [[M:%.*]] = or <8 x i32> [[A1:%.*]], +; CHECK-NEXT: [[S:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A0:%.*]], <8 x i32> [[M]]) +; CHECK-NEXT: ret <8 x i32> [[S]] +; + %m = or <8 x i32> %a1, + %s = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %m) + ret <8 x i32> %s +} + declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) define <8 x float> @identity_test_permvar_sf_256(<8 x float> %a0) { @@ -177,6 +188,17 @@ define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> ret <8 x float> %3 } +define <8 x float> @demandedbit_test_permvar_sf_256_mask(<8 x float> %a0, <8 x i32> %a1) { +; CHECK-LABEL: @demandedbit_test_permvar_sf_256_mask( +; CHECK-NEXT: [[M:%.*]] = or <8 x i32> [[A1:%.*]], +; CHECK-NEXT: [[S:%.*]] = call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A0:%.*]], <8 x i32> [[M]]) +; CHECK-NEXT: ret <8 x float> [[S]] +; + %m = or <8 x i32> %a1, + %s = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %m) + ret <8 x float> %s +} + declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>) define <4 x i64> @identity_test_permvar_di_256(<4 x i64> %a0) { @@ -273,6 +295,17 @@ define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passt ret <4 x i64> %3 } +define <4 x i64> @demandedbits_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %a1) { +; CHECK-LABEL: @demandedbits_test_permvar_di_256_mask( +; CHECK-NEXT: [[M:%.*]] = or <4 x i64> [[A1:%.*]], +; CHECK-NEXT: [[S:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[A0:%.*]], <4 x i64> [[M]]) +; CHECK-NEXT: ret <4 x i64> [[S]] +; + %m = or <4 x i64> %a1, + %s = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> %m) + ret <4 x i64> %s +} + declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>) define <4 x double> @identity_test_permvar_df_256(<4 x double> %a0) { @@ -369,6 +402,17 @@ define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x doubl ret <4 x double> %3 } +define <4 x double> @demandedbits_test_permvar_df_256_mask(<4 x double> %a0, <4 x i64> %a1) { +; CHECK-LABEL: @demandedbits_test_permvar_df_256_mask( +; CHECK-NEXT: [[M:%.*]] = or <4 x i64> [[A1:%.*]], +; CHECK-NEXT: [[S:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[A0:%.*]], <4 x i64> [[M]]) +; CHECK-NEXT: ret <4 x double> [[S]] +; + %m = or <4 x i64> %a1, + %s = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> %m) + ret <4 x double> %s +} + declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) define <16 x i32> @identity_test_permvar_si_512(<16 x i32> %a0) { @@ -457,6 
+501,17 @@ define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %pa ret <16 x i32> %3 } +define <16 x i32> @demandedbit_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %a1) { +; CHECK-LABEL: @demandedbit_test_permvar_si_512_mask( +; CHECK-NEXT: [[M:%.*]] = or <16 x i32> [[A1:%.*]], +; CHECK-NEXT: [[S:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[A0:%.*]], <16 x i32> [[M]]) +; CHECK-NEXT: ret <16 x i32> [[S]] +; + %m = or <16 x i32> %a1, + %s = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> %m) + ret <16 x i32> %s +} + declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>) define <16 x float> @identity_test_permvar_sf_512(<16 x float> %a0) { @@ -545,6 +600,17 @@ define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x floa ret <16 x float> %3 } +define <16 x float> @demandedbit_test_permvar_sf_512_mask(<16 x float> %a0, <16 x i32> %a1) { +; CHECK-LABEL: @demandedbit_test_permvar_sf_512_mask( +; CHECK-NEXT: [[M:%.*]] = or <16 x i32> [[A1:%.*]], +; CHECK-NEXT: [[S:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[A0:%.*]], <16 x i32> [[M]]) +; CHECK-NEXT: ret <16 x float> [[S]] +; + %m = or <16 x i32> %a1, + %s = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> %m) + ret <16 x float> %s +} + declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) define <8 x i64> @identity_test_permvar_di_512(<8 x i64> %a0) { @@ -633,6 +699,17 @@ define <8 x i64> @undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passt ret <8 x i64> %3 } +define <8 x i64> @demandedbit_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %a1) { +; CHECK-LABEL: @demandedbit_test_permvar_di_512_mask( +; CHECK-NEXT: [[M:%.*]] = or <8 x i64> [[A1:%.*]], +; CHECK-NEXT: [[S:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[A0:%.*]], <8 x i64> [[M]]) +; CHECK-NEXT: ret <8 x i64> [[S]] +; + %m = or <8 x i64> %a1, + %s = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> %m) + ret <8 x i64> %s +} + declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>) define <8 x double> @identity_test_permvar_df_512(<8 x double> %a0) { @@ -721,6 +798,17 @@ define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x doubl ret <8 x double> %3 } +define <8 x double> @demandedbit_test_permvar_df_512_mask(<8 x double> %a0, <8 x i64> %a1) { +; CHECK-LABEL: @demandedbit_test_permvar_df_512_mask( +; CHECK-NEXT: [[M:%.*]] = or <8 x i64> [[A1:%.*]], +; CHECK-NEXT: [[S:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[A0:%.*]], <8 x i64> [[M]]) +; CHECK-NEXT: ret <8 x double> [[S]] +; + %m = or <8 x i64> %a1, + %s = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> %m) + ret <8 x double> %s +} + declare <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16>, <8 x i16>) define <8 x i16> @identity_test_permvar_hi_128(<8 x i16> %a0) { @@ -809,6 +897,17 @@ define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passt ret <8 x i16> %3 } +define <8 x i16> @demandedbit_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: @demandedbit_test_permvar_hi_128_mask( +; CHECK-NEXT: [[M:%.*]] = or <8 x i16> [[A1:%.*]], +; CHECK-NEXT: [[S:%.*]] = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> [[A0:%.*]], <8 x i16> [[M]]) +; CHECK-NEXT: ret <8 x i16> [[S]] +; + %m = or <8 x i16> %a1, + %s = call <8 x i16> 
@llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> %m) + ret <8 x i16> %s +} + declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>) define <16 x i16> @identity_test_permvar_hi_256(<16 x i16> %a0) { @@ -897,6 +996,17 @@ define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %pa ret <16 x i16> %3 } +define <16 x i16> @demandedbit_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %a1) { +; CHECK-LABEL: @demandedbit_test_permvar_hi_256_mask( +; CHECK-NEXT: [[M:%.*]] = or <16 x i16> [[A1:%.*]], +; CHECK-NEXT: [[S:%.*]] = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> [[A0:%.*]], <16 x i16> [[M]]) +; CHECK-NEXT: ret <16 x i16> [[S]] +; + %m = or <16 x i16> %a1, + %s = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> %m) + ret <16 x i16> %s +} + declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) define <32 x i16> @identity_test_permvar_hi_512(<32 x i16> %a0) { @@ -985,6 +1095,17 @@ define <32 x i16> @undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %pa ret <32 x i16> %3 } +define <32 x i16> @demandedbit_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %a1) { +; CHECK-LABEL: @demandedbit_test_permvar_hi_512_mask( +; CHECK-NEXT: [[M:%.*]] = or <32 x i16> [[A1:%.*]], +; CHECK-NEXT: [[S:%.*]] = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> [[A0:%.*]], <32 x i16> [[M]]) +; CHECK-NEXT: ret <32 x i16> [[S]] +; + %m = or <32 x i16> %a1, + %s = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> %m) + ret <32 x i16> %s +} + declare <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8>, <16 x i8>) define <16 x i8> @identity_test_permvar_qi_128(<16 x i8> %a0) { @@ -1073,6 +1194,17 @@ define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passt ret <16 x i8> %3 } +define <16 x i8> @demandedbit_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %a1) { +; CHECK-LABEL: @demandedbit_test_permvar_qi_128_mask( +; CHECK-NEXT: [[M:%.*]] = or <16 x i8> [[A1:%.*]], +; CHECK-NEXT: [[S:%.*]] = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> [[A0:%.*]], <16 x i8> [[M]]) +; CHECK-NEXT: ret <16 x i8> [[S]] +; + %m = or <16 x i8> %a1, + %s = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> %m) + ret <16 x i8> %s +} + declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>) define <32 x i8> @identity_test_permvar_qi_256(<32 x i8> %a0) { @@ -1161,6 +1293,17 @@ define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passt ret <32 x i8> %3 } +define <32 x i8> @demandedbit_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %a1) { +; CHECK-LABEL: @demandedbit_test_permvar_qi_256_mask( +; CHECK-NEXT: [[M:%.*]] = or <32 x i8> [[A1:%.*]], +; CHECK-NEXT: [[S:%.*]] = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> [[A0:%.*]], <32 x i8> [[M]]) +; CHECK-NEXT: ret <32 x i8> [[S]] +; + %m = or <32 x i8> %a1, + %s = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> %m) + ret <32 x i8> %s +} + declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) define <64 x i8> @identity_test_permvar_qi_512(<64 x i8> %a0) { @@ -1248,3 +1391,14 @@ define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passt %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru ret <64 x i8> %3 } + +define <64 x i8> @demandedbit_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %a1) { +; CHECK-LABEL: @demandedbit_test_permvar_qi_512_mask( +; CHECK-NEXT:
[[M:%.*]] = or <64 x i8> [[A1:%.*]], +; CHECK-NEXT: [[S:%.*]] = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> [[A0:%.*]], <64 x i8> [[M]]) +; CHECK-NEXT: ret <64 x i8> [[S]] +; + %m = or <64 x i8> %a1, + %s = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> %m) + ret <64 x i8> %s +} diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vpermi2.ll b/llvm/test/Transforms/InstCombine/X86/x86-vpermi2.ll index a65358e1033cc6..eb6ad4458d932e 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-vpermi2.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-vpermi2.ll @@ -25,6 +25,30 @@ define <2 x i64> @shuffle_vpermv3_v2i64_unary(<2 x i64> %x0) { ret <2 x i64> %r } +define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %m) { +; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits( +; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[M:%.*]]) { +; CHECK-NEXT: [[T:%.*]] = or <2 x i64> [[M]], +; CHECK-NEXT: [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[T]], <2 x i64> [[X1]]) +; CHECK-NEXT: ret <2 x i64> [[R]] +; + %t = or <2 x i64> %m, + %r = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %t, <2 x i64> %x1) + ret <2 x i64> %r +} + +define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits_negative(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %m) { +; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits_negative( +; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[M:%.*]]) { +; CHECK-NEXT: [[T:%.*]] = or <2 x i64> [[M]], +; CHECK-NEXT: [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[T]], <2 x i64> [[X1]]) +; CHECK-NEXT: ret <2 x i64> [[R]] +; + %t = or <2 x i64> %m, + %r = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %t, <2 x i64> %x1) + ret <2 x i64> %r +} + define <4 x i64> @shuffle_vpermv3_v4i64(<4 x i64> %x0, <4 x i64> %x1) { ; CHECK-LABEL: define <4 x i64> @shuffle_vpermv3_v4i64( ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]]) { @@ -45,6 +69,18 @@ define <4 x i64> @shuffle_vpermv3_v4i64_unary(<4 x i64> %x0) { ret <4 x i64> %r } +define <4 x i64> @shuffle_vpermv3_v4i64_demandedbits(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %m) { +; CHECK-LABEL: define <4 x i64> @shuffle_vpermv3_v4i64_demandedbits( +; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[M:%.*]]) { +; CHECK-NEXT: [[T:%.*]] = or <4 x i64> [[M]], +; CHECK-NEXT: [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[T]], <4 x i64> [[X1]]) +; CHECK-NEXT: ret <4 x i64> [[R]] +; + %t = or <4 x i64> %m, + %r = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %t, <4 x i64> %x1) + ret <4 x i64> %r +} + define <8 x i64> @shuffle_vpermv3_v8i64(<8 x i64> %x0, <8 x i64> %x1) { ; CHECK-LABEL: define <8 x i64> @shuffle_vpermv3_v8i64( ; CHECK-SAME: <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) { @@ -65,6 +101,18 @@ define <8 x i64> @shuffle_vpermv3_v8i64_unary(<8 x i64> %x0) { ret <8 x i64> %r } +define <8 x i64> @shuffle_vpermv3_v8i64_demandedbits(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %m) { +; CHECK-LABEL: define <8 x i64> @shuffle_vpermv3_v8i64_demandedbits( +; CHECK-SAME: <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[M:%.*]]) { +; CHECK-NEXT: [[T:%.*]] = or <8 x i64> [[M]], +; CHECK-NEXT: [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> [[T]], <8 x 
i64> [[X1]]) +; CHECK-NEXT: ret <8 x i64> [[R]] +; + %t = or <8 x i64> %m, + %r = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %t, <8 x i64> %x1) + ret <8 x i64> %r +} + ; ; vXi32 ; @@ -89,6 +137,18 @@ define <4 x i32> @shuffle_vpermv3_v4i32_unary(<4 x i32> %x0) { ret <4 x i32> %r } +define <4 x i32> @shuffle_vpermv3_v4i32_demandedbits(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %m) { +; CHECK-LABEL: define <4 x i32> @shuffle_vpermv3_v4i32_demandedbits( +; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[M:%.*]]) { +; CHECK-NEXT: [[T:%.*]] = or <4 x i32> [[M]], +; CHECK-NEXT: [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[T]], <4 x i32> [[X1]]) +; CHECK-NEXT: ret <4 x i32> [[R]] +; + %t = or <4 x i32> %m, + %r = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %t, <4 x i32> %x1) + ret <4 x i32> %r +} + define <8 x i32> @shuffle_vpermv3_v8i32(<8 x i32> %x0, <8 x i32> %x1) { ; CHECK-LABEL: define <8 x i32> @shuffle_vpermv3_v8i32( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]]) { @@ -109,6 +169,18 @@ define <8 x i32> @shuffle_vpermv3_v8i32_unary(<8 x i32> %x0) { ret <8 x i32> %r } +define <8 x i32> @shuffle_vpermv3_v8i32_demandedbits(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %m) { +; CHECK-LABEL: define <8 x i32> @shuffle_vpermv3_v8i32_demandedbits( +; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[M:%.*]]) { +; CHECK-NEXT: [[T:%.*]] = or <8 x i32> [[M]], +; CHECK-NEXT: [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[T]], <8 x i32> [[X1]]) +; CHECK-NEXT: ret <8 x i32> [[R]] +; + %t = or <8 x i32> %m, + %r = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %t, <8 x i32> %x1) + ret <8 x i32> %r +} + define <16 x i32> @shuffle_vpermv3_v16i32(<16 x i32> %x0, <16 x i32> %x1) { ; CHECK-LABEL: define <16 x i32> @shuffle_vpermv3_v16i32( ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) { @@ -129,6 +201,18 @@ define <16 x i32> @shuffle_vpermv3_v16i32_unary(<16 x i32> %x0) { ret <16 x i32> %r } +define <16 x i32> @shuffle_vpermv3_v16i32_demandedbits(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %m) { +; CHECK-LABEL: define <16 x i32> @shuffle_vpermv3_v16i32_demandedbits( +; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[M:%.*]]) { +; CHECK-NEXT: [[T:%.*]] = or <16 x i32> [[M]], +; CHECK-NEXT: [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> [[T]], <16 x i32> [[X1]]) +; CHECK-NEXT: ret <16 x i32> [[R]] +; + %t = or <16 x i32> %m, + %r = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %t, <16 x i32> %x1) + ret <16 x i32> %r +} + ; ; vXi16 ; @@ -153,6 +237,18 @@ define <8 x i16> @shuffle_vpermv3_v8i16_unary(<8 x i16> %x0) { ret <8 x i16> %r } +define <8 x i16> @shuffle_vpermv3_v8i16_demandedbits(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %m) { +; CHECK-LABEL: define <8 x i16> @shuffle_vpermv3_v8i16_demandedbits( +; CHECK-SAME: <8 x i16> [[X0:%.*]], <8 x i16> [[X1:%.*]], <8 x i16> [[M:%.*]]) { +; CHECK-NEXT: [[T:%.*]] = or <8 x i16> [[M]], +; CHECK-NEXT: [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> [[T]], <8 x i16> [[X1]]) +; CHECK-NEXT: ret <8 x i16> [[R]] +; + %t = or <8 x i16> %m, + %r = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %t, <8 x i16> %x1) + ret <8 x i16> %r +} + define <16 x i16> @shuffle_vpermv3_v16i16(<16 x i16> 
%x0, <16 x i16> %x1) { ; CHECK-LABEL: define <16 x i16> @shuffle_vpermv3_v16i16( ; CHECK-SAME: <16 x i16> [[X0:%.*]], <16 x i16> [[X1:%.*]]) { @@ -173,6 +269,18 @@ define <16 x i16> @shuffle_vpermv3_v16i16_unary(<16 x i16> %x0) { ret <16 x i16> %r } +define <16 x i16> @shuffle_vpermv3_v16i16_demandedbits(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %m) { +; CHECK-LABEL: define <16 x i16> @shuffle_vpermv3_v16i16_demandedbits( +; CHECK-SAME: <16 x i16> [[X0:%.*]], <16 x i16> [[X1:%.*]], <16 x i16> [[M:%.*]]) { +; CHECK-NEXT: [[T:%.*]] = or <16 x i16> [[M]], +; CHECK-NEXT: [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> [[T]], <16 x i16> [[X1]]) +; CHECK-NEXT: ret <16 x i16> [[R]] +; + %t = or <16 x i16> %m, + %r = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %t, <16 x i16> %x1) + ret <16 x i16> %r +} + define <32 x i16> @shuffle_vpermv3_v32i16(<32 x i16> %x0, <32 x i16> %x1) { ; CHECK-LABEL: define <32 x i16> @shuffle_vpermv3_v32i16( ; CHECK-SAME: <32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) { @@ -193,6 +301,18 @@ define <32 x i16> @shuffle_vpermv3_v32i16_unary(<32 x i16> %x0) { ret <32 x i16> %r } +define <32 x i16> @shuffle_vpermv3_v32i16_demandedbits(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %m) { +; CHECK-LABEL: define <32 x i16> @shuffle_vpermv3_v32i16_demandedbits( +; CHECK-SAME: <32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]], <32 x i16> [[M:%.*]]) { +; CHECK-NEXT: [[T:%.*]] = or <32 x i16> [[M]], +; CHECK-NEXT: [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> [[T]], <32 x i16> [[X1]]) +; CHECK-NEXT: ret <32 x i16> [[R]] +; + %t = or <32 x i16> %m, + %r = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %t, <32 x i16> %x1) + ret <32 x i16> %r +} + ; ; vXi8 ; @@ -217,6 +337,18 @@ define <16 x i8> @shuffle_vpermv3_v16i8_unary(<16 x i8> %x0) { ret <16 x i8> %r } +define <16 x i8> @shuffle_vpermv3_v16i8_demandedbits(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %m) { +; CHECK-LABEL: define <16 x i8> @shuffle_vpermv3_v16i8_demandedbits( +; CHECK-SAME: <16 x i8> [[X0:%.*]], <16 x i8> [[X1:%.*]], <16 x i8> [[M:%.*]]) { +; CHECK-NEXT: [[T:%.*]] = or <16 x i8> [[M]], +; CHECK-NEXT: [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> [[T]], <16 x i8> [[X1]]) +; CHECK-NEXT: ret <16 x i8> [[R]] +; + %t = or <16 x i8> %m, + %r = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %t, <16 x i8> %x1) + ret <16 x i8> %r +} + define <32 x i8> @shuffle_vpermv3_v32i8(<32 x i8> %x0, <32 x i8> %x1) { ; CHECK-LABEL: define <32 x i8> @shuffle_vpermv3_v32i8( ; CHECK-SAME: <32 x i8> [[X0:%.*]], <32 x i8> [[X1:%.*]]) { @@ -237,6 +369,18 @@ define <32 x i8> @shuffle_vpermv3_v32i8_unary(<32 x i8> %x0) { ret <32 x i8> %r } +define <32 x i8> @shuffle_vpermv3_v32i8_demandedbits(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %m) { +; CHECK-LABEL: define <32 x i8> @shuffle_vpermv3_v32i8_demandedbits( +; CHECK-SAME: <32 x i8> [[X0:%.*]], <32 x i8> [[X1:%.*]], <32 x i8> [[M:%.*]]) { +; CHECK-NEXT: [[T:%.*]] = or <32 x i8> [[M]], +; CHECK-NEXT: [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> [[T]], <32 x i8> [[X1]]) +; CHECK-NEXT: ret <32 x i8> [[R]] +; + %t = or <32 x i8> %m, + %r = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %t, <32 x i8> %x1) + ret <32 x i8> %r +} + define <64 x i8> @shuffle_vpermv3_v64i8(<64 x i8> %x0, <64 x i8> %x1) { ; CHECK-LABEL: 
define <64 x i8> @shuffle_vpermv3_v64i8( ; CHECK-SAME: <64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]]) { @@ -256,3 +400,15 @@ define <64 x i8> @shuffle_vpermv3_v64i8_unary(<64 x i8> %x0) { %r = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> , <64 x i8> %x0) ret <64 x i8> %r } + +define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %m) { +; CHECK-LABEL: define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits( +; CHECK-SAME: <64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]], <64 x i8> [[M:%.*]]) { +; CHECK-NEXT: [[T:%.*]] = or <64 x i8> [[M]], +; CHECK-NEXT: [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> [[T]], <64 x i8> [[X1]]) +; CHECK-NEXT: ret <64 x i8> [[R]] +; + %t = or <64 x i8> %m, + %r = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %t, <64 x i8> %x1) + ret <64 x i8> %r +} From a3816b5a573dbf57ba3082a919ca2de6b47257e9 Mon Sep 17 00:00:00 2001 From: Patryk Wychowaniec Date: Fri, 30 Aug 2024 16:50:56 +0200 Subject: [PATCH 44/98] [AVR] Fix LLD test (#106739) Since we don't generate relocations for those, it doesn't make sense to assert them here; fallout of https://github.com/llvm/llvm-project/pull/106722. --- lld/test/ELF/avr-reloc.s | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/lld/test/ELF/avr-reloc.s b/lld/test/ELF/avr-reloc.s index ec088eaa149d01..41c32580f63a1c 100644 --- a/lld/test/ELF/avr-reloc.s +++ b/lld/test/ELF/avr-reloc.s @@ -76,32 +76,6 @@ adiw r24, b ; R_AVR_6_ADIW in r20, b ; R_AVR_PORT6 sbic b, 1 ; R_AVR_PORT5 -.section .PCREL,"ax",@progbits -; CHECK-LABEL: section .PCREL -; CHECK: rjmp .+30 -; CHECK-NEXT: rjmp .-36 -; CHECK-NEXT: breq .+26 -; CHECK-NEXT: breq .-40 -; CHECK-NEXT: rjmp .-4096 -; CHECK-NEXT: rjmp .+4094 -; CHECK-NEXT: rjmp .+4094 -; CHECK-NEXT: rjmp .-4096 -; CHECK-NEXT: breq .-128 -; CHECK-NEXT: breq .+126 -; HEX-LABEL: section .PCREL: -; HEX-NEXT: 0fc0eecf 69f061f3 -foo: -rjmp foo + 32 ; R_AVR_13_PCREL -rjmp foo - 32 ; R_AVR_13_PCREL -breq foo + 32 ; R_AVR_7_PCREL -breq foo - 32 ; R_AVR_7_PCREL -rjmp 1f - 4096 $ 1: ; R_AVR_13_PCREL -rjmp 1f + 4094 $ 1: ; R_AVR_13_PCREL -rjmp 1f - 4098 $ 1: ; R_AVR_13_PCREL (overflow) -rjmp 1f + 4096 $ 1: ; R_AVR_13_PCREL (overflow) -breq 1f - 128 $ 1: ; R_AVR_7_PCREL -breq 1f + 126 $ 1: ; R_AVR_7_PCREL - .section .LDSSTS,"ax",@progbits ; CHECK-LABEL: section .LDSSTS: ; CHECK: lds r20, 0x1e From 924907bc6aa17bb15241143dc9858da971b25908 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 30 Aug 2024 07:56:14 -0700 Subject: [PATCH 45/98] [DAG] Prefer 0.0 over -0.0 as neutral value for FADD w/NoSignedZero (#106616) When getting a neutral value, we can prefer using a positive zero over a negative zero if nsz is set on the FADD (or reduction). A positive zero should be cheaper to materialize on basically all targets. Arguably, we should be doing this kind of canonicalization in DAGCombine, but we don't do that for any of the other reduction variants, so this seems like the path of least resistance. This does mean that we can only do this for "fast" reductions. Just nsz isn't enough, as that goes through the SEQ_FADD path where the IR level start value isn't folded away. If folks think this is too RISCV specific, let me know. There's a trivial RISCV specific implementation. I went with the generic one as I thought this might benefit other targets.
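For context on the signed-zero subtlety: -0.0 is the strict IEEE-754 identity for fadd, since x + (-0.0) == x for every x, including x == -0.0, whereas x + (+0.0) maps -0.0 to +0.0. Under nsz the two zeros are interchangeable, which is what makes the cheaper +0.0 legal as the reduction start value. A minimal standalone C++ sketch of that behavior (illustrative only, not part of the patch):

#include <cassert>
#include <cmath>

int main() {
  // -0.0 is the true additive identity: it preserves the sign of a zero input.
  assert(std::signbit(-0.0 + -0.0));  // (-0.0) + (-0.0) == -0.0
  // +0.0 is not: it rewrites -0.0 to +0.0, so it is only a valid neutral
  // value once nsz says the sign of zero may be ignored.
  assert(!std::signbit(-0.0 + 0.0));  // (-0.0) + (+0.0) == +0.0
  return 0;
}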
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 4 +++- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 9efcd3f25797b5..7f57b6db40ef49 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -13267,7 +13267,9 @@ SDValue SelectionDAG::getNeutralElement(unsigned Opcode, const SDLoc &DL, case ISD::SMIN: return getConstant(APInt::getSignedMaxValue(VT.getSizeInBits()), DL, VT); case ISD::FADD: - return getConstantFP(-0.0, DL, VT); + // If flags allow, prefer positive zero since it's generally cheaper + // to materialize on most targets. + return getConstantFP(Flags.hasNoSignedZeros() ? 0.0 : -0.0, DL, VT); case ISD::FMUL: return getConstantFP(1.0, DL, VT); case ISD::FMINNUM: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll index 5d5807cbadbad5..4be680e272e5b9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll @@ -524,8 +524,7 @@ define float @vreduce_fadd_v7f32_neutralstart_fast(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, 524288 -; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vmv.s.x v10, zero ; CHECK-NEXT: vfredusum.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret From 49b04e60ed99307b0b4369b8956e6c15c7094d07 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 30 Aug 2024 14:59:10 +0000 Subject: [PATCH 46/98] [gn build] Port 034f2b380bd2 --- llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn | 1 + llvm/utils/gn/secondary/llvm/unittests/SandboxIR/BUILD.gn | 1 + 2 files changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn index 7ff3faf63bedc9..f176d8b94b5322 100644 --- a/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn @@ -7,5 +7,6 @@ static_library("SandboxIR") { sources = [ "SandboxIR.cpp", "Tracker.cpp", + "Type.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/unittests/SandboxIR/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/SandboxIR/BUILD.gn index 2d246eccb872ea..02ef303a6946f3 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/SandboxIR/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/SandboxIR/BUILD.gn @@ -9,5 +9,6 @@ unittest("SandboxIRTests") { sources = [ "SandboxIRTest.cpp", "TrackerTest.cpp", + "TypesTest.cpp", ] } From 5224f65b44f9873c8298d51233005d4802ff0ba0 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 30 Aug 2024 14:59:11 +0000 Subject: [PATCH 47/98] [gn build] Port 115b87636b9f --- llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn index 3a660a87d8af63..47b03b42d096d2 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn @@ -64,6 +64,7 @@ unittest("SupportTests") { "MemoryBufferRefTest.cpp", "MemoryBufferTest.cpp", "MemoryTest.cpp", + "ModRefTest.cpp", "NativeFormatTests.cpp", "OptimizedStructLayoutTest.cpp", "ParallelTest.cpp", From
b4d9c52db474041e417f547b699caeeecfa714cc Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 30 Aug 2024 14:59:12 +0000 Subject: [PATCH 48/98] [gn build] Port bd6531b95086 --- llvm/utils/gn/secondary/llvm/unittests/Transforms/IPO/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/unittests/Transforms/IPO/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Transforms/IPO/BUILD.gn index f5b162dd102320..ad44635f107a16 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Transforms/IPO/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Transforms/IPO/BUILD.gn @@ -10,6 +10,7 @@ unittest("IPOTests") { sources = [ "AttributorTest.cpp", "FunctionSpecializationTest.cpp", + "ImportIDTableTests.cpp", "LowerTypeTests.cpp", "WholeProgramDevirt.cpp", ] From 206b5aff44a95754f6dd7a5696efa024e983ac59 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 Aug 2024 19:11:45 +0400 Subject: [PATCH 49/98] AtomicExpand: Allow incrementally legalizing atomicrmw (#103371) If a lowering changed control flow, resume the legalization loop at the first newly inserted block. This will allow incrementally legalizing atomicrmw and cmpxchg. The AArch64 test might be a bugfix. Previously it would lower the vector FP case as a cmpxchg loop; the cmpxchgs produced that way now get lowered as well, where previously they weren't. Maybe it shouldn't be reporting cmpxchg for the expand type in the first place though. --- llvm/lib/CodeGen/AtomicExpandPass.cpp | 35 +- llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll | 373 +++++++++++--------- llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll | 373 +++++++++++--------- llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll | 373 +++++++++++--------- llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll | 373 +++++++++++--------- 5 files changed, 836 insertions(+), 691 deletions(-) diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 39a705599f90cc..b9732e816ea7e6 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -351,17 +351,30 @@ bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) { bool MadeChange = false; - SmallVector AtomicInsts; - - // Changing control-flow while iterating through it is a bad idea, so gather a - // list of all atomic instructions before we start. - for (Instruction &I : instructions(F)) - if (I.isAtomic() && !isa(&I)) - AtomicInsts.push_back(&I); - - for (auto *I : AtomicInsts) { - if (processAtomicInstr(I)) - MadeChange = true; + for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE;) { + BasicBlock *BB = &*BBI; + ++BBI; + + BasicBlock::iterator Next; + + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; + I = Next) { + Instruction &Inst = *I; + Next = std::next(I); + + if (processAtomicInstr(&Inst)) { + MadeChange = true; + + // Detect control flow change and resume iteration from the original + // block to inspect any newly inserted blocks. This allows incremental + // legalization of atomicrmw and cmpxchg.
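+      // Next was captured before the expansion ran; if processAtomicInstr split + // BB, the instructions following Inst (including *Next) were moved into a + // newly created block, so Next->getParent() no longer equals BB.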
+ if (BB != Next->getParent()) { + BBI = BB->getIterator(); + BBE = F.end(); + break; + } + } + } } return MadeChange; diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll index 0d230bb9dcc6e9..ed9c1b037d0cc7 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -43,46 +43,49 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 -; SOFTFP-NOLSE-NEXT: b .LBB0_1 -; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, half %value 
seq_cst, align 2 ret half %res @@ -128,46 +131,49 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 -; SOFTFP-NOLSE-NEXT: b .LBB1_1 -; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -232,36 +238,40 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 -; SOFTFP-NOLSE-NEXT: b .LBB2_1 -; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 2 @@ -327,36 +337,40 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 -; SOFTFP-NOLSE-NEXT: b .LBB3_1 -; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 4 @@ -399,35 +413,38 @@ define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldr w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w20 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 -; SOFTFP-NOLSE-NEXT: b .LBB4_1 -; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, float %value seq_cst, align 4 @@ -469,36 +486,40 @@ define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: mov x20, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp x8, x20 -; SOFTFP-NOLSE-NEXT: mov x20, x8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: mov w9, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x20 -; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: mov x0, x21 +; SOFTFP-NOLSE-NEXT: mov x1, x20 ; SOFTFP-NOLSE-NEXT: bl __adddf3 -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: cmp x8, x21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 -; SOFTFP-NOLSE-NEXT: b .LBB5_1 -; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, double %value seq_cst, align 8 @@ -687,18 +708,18 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] -; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 -; SOFTFP-NOLSE-NEXT: cmp w8, w21 -; SOFTFP-NOLSE-NEXT: mov w21, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -711,29 +732,33 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr 
%ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] -; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] +; SOFTFP-NOLSE-NEXT: cmp w22, w8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 -; SOFTFP-NOLSE-NEXT: b .LBB7_1 -; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB7_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -799,17 +824,18 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 -; SOFTFP-NOLSE-NEXT: cmp w21, w23 -; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 @@ -819,25 +845,28 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil 
w23, w21, #0, #16 +; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] -; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] +; SOFTFP-NOLSE-NEXT: cmp w22, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 -; SOFTFP-NOLSE-NEXT: b .LBB8_1 -; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -885,45 +914,49 @@ define <2 x float> @test_atomicrmw_fadd_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 -; SOFTFP-NOLSE-NEXT: cmp x23, x8 -; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 -; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB9_6 ; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w23 -; SOFTFP-NOLSE-NEXT: mov w9, w0 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: mov w9, w22 +; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] -; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; 
SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] +; SOFTFP-NOLSE-NEXT: cmp x22, x9 ; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 -; SOFTFP-NOLSE-NEXT: b .LBB9_1 -; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w23 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB9_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_2 +; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll index bfe0d20ca814bc..888b795876f7df 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -45,46 +45,49 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; 
SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 -; SOFTFP-NOLSE-NEXT: b .LBB0_1 -; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -130,46 +133,49 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 -; SOFTFP-NOLSE-NEXT: b .LBB1_1 -; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, 
w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -234,36 +240,40 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 -; SOFTFP-NOLSE-NEXT: b .LBB2_1 -; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 2 @@ -329,36 +339,40 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 -; SOFTFP-NOLSE-NEXT: b .LBB3_1 -; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 4 @@ -401,35 +415,38 @@ define float @test_atomicrmw_fmax_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldr w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w20 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 -; SOFTFP-NOLSE-NEXT: b .LBB4_1 -; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, float %value seq_cst, align 4 @@ -471,36 +488,40 @@ define double @test_atomicrmw_fmax_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: mov x20, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp x8, x20 -; SOFTFP-NOLSE-NEXT: mov x20, x8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: mov w9, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x20 -; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: mov x0, x21 +; SOFTFP-NOLSE-NEXT: mov x1, x20 ; SOFTFP-NOLSE-NEXT: bl fmax -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: cmp x8, x21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 -; SOFTFP-NOLSE-NEXT: b .LBB5_1 -; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, double %value seq_cst, align 8 @@ -567,18 +588,18 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] -; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 -; SOFTFP-NOLSE-NEXT: cmp w8, w21 -; SOFTFP-NOLSE-NEXT: mov w21, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB6_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB6_6 ; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 @@ -591,29 +612,33 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, 
<2 x half> ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] -; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] +; SOFTFP-NOLSE-NEXT: cmp w22, w8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB6_3 -; SOFTFP-NOLSE-NEXT: b .LBB6_1 -; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB6_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -723,17 +748,18 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 -; SOFTFP-NOLSE-NEXT: cmp w21, w23 -; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -743,25 +769,28 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 +; 
SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] -; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] +; SOFTFP-NOLSE-NEXT: cmp w22, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 -; SOFTFP-NOLSE-NEXT: b .LBB7_1 -; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -809,45 +838,49 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 -; SOFTFP-NOLSE-NEXT: cmp x23, x8 -; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 -; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: mov w8, w23 -; SOFTFP-NOLSE-NEXT: mov w9, w0 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: mov w9, w22 +; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] -; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] 
+; SOFTFP-NOLSE-NEXT: cmp x22, x9 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 -; SOFTFP-NOLSE-NEXT: b .LBB8_1 -; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w23 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB8_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll index 6b7d2df044460a..a3665c6e428608 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -45,46 +45,49 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; 
SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 -; SOFTFP-NOLSE-NEXT: b .LBB0_1 -; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -130,46 +133,49 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 -; SOFTFP-NOLSE-NEXT: b .LBB1_1 -; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 +; 
SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -234,36 +240,40 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 -; SOFTFP-NOLSE-NEXT: b .LBB2_1 -; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 2 @@ -329,36 +339,40 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 -; SOFTFP-NOLSE-NEXT: b .LBB3_1 -; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 4 @@ -401,35 +415,38 @@ define float @test_atomicrmw_fmin_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldr w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w20 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 -; SOFTFP-NOLSE-NEXT: b .LBB4_1 -; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, float %value seq_cst, align 4 @@ -471,36 +488,40 @@ define double @test_atomicrmw_fmin_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: mov x20, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp x8, x20 -; SOFTFP-NOLSE-NEXT: mov x20, x8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: mov w9, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x20 -; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: mov x0, x21 +; SOFTFP-NOLSE-NEXT: mov x1, x20 ; SOFTFP-NOLSE-NEXT: bl fmin -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: cmp x8, x21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 -; SOFTFP-NOLSE-NEXT: b .LBB5_1 -; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, double %value seq_cst, align 8 @@ -567,18 +588,18 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] -; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 -; SOFTFP-NOLSE-NEXT: cmp w8, w21 -; SOFTFP-NOLSE-NEXT: mov w21, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB6_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB6_6 ; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 @@ -591,29 +612,33 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, 
<2 x half> ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] -; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] +; SOFTFP-NOLSE-NEXT: cmp w22, w8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB6_3 -; SOFTFP-NOLSE-NEXT: b .LBB6_1 -; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB6_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_2 +; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -723,17 +748,18 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 -; SOFTFP-NOLSE-NEXT: cmp w21, w23 -; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -743,25 +769,28 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 +; 
SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] -; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] +; SOFTFP-NOLSE-NEXT: cmp w22, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 -; SOFTFP-NOLSE-NEXT: b .LBB7_1 -; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -809,45 +838,49 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 -; SOFTFP-NOLSE-NEXT: cmp x23, x8 -; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 -; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: mov w8, w23 -; SOFTFP-NOLSE-NEXT: mov w9, w0 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: mov w9, w22 +; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] -; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] 
+; SOFTFP-NOLSE-NEXT: cmp x22, x9 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 -; SOFTFP-NOLSE-NEXT: b .LBB8_1 -; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w23 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB8_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll index 67e164037d5ce7..7725ce0e731859 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll @@ -43,46 +43,49 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore 
; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 -; SOFTFP-NOLSE-NEXT: b .LBB0_1 -; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 +; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -128,46 +131,49 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w23 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff ; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 -; SOFTFP-NOLSE-NEXT: b .LBB1_1 -; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, 
.LBB1_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 +; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -232,36 +238,40 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 -; SOFTFP-NOLSE-NEXT: b .LBB2_1 -; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 +; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 2 @@ -327,36 +337,40 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] -; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 +; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] +; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 -; SOFTFP-NOLSE-NEXT: b .LBB3_1 -; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 +; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 4 @@ -399,35 +413,38 @@ define float @test_atomicrmw_fsub_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldr w0, [x0] +; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp w8, w20 -; SOFTFP-NOLSE-NEXT: mov w20, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w20 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] -; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] +; SOFTFP-NOLSE-NEXT: cmp w0, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 -; SOFTFP-NOLSE-NEXT: b .LBB4_1 -; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 +; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, float %value seq_cst, align 4 @@ -469,36 +486,40 @@ define double @test_atomicrmw_fsub_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x21, x1 +; SOFTFP-NOLSE-NEXT: mov x20, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: cmp x8, x20 -; SOFTFP-NOLSE-NEXT: mov x20, x8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 +; SOFTFP-NOLSE-NEXT: mov w9, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x20 -; SOFTFP-NOLSE-NEXT: mov x1, x21 +; SOFTFP-NOLSE-NEXT: mov x0, x21 +; SOFTFP-NOLSE-NEXT: mov x1, x20 ; SOFTFP-NOLSE-NEXT: bl __subdf3 -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: cmp x8, x21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 -; SOFTFP-NOLSE-NEXT: b .LBB5_1 -; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: mov x21, x8 +; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 +; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, double %value seq_cst, align 8 @@ -687,18 +708,18 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] -; SOFTFP-NOLSE-NEXT: mov w22, w1 +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 -; SOFTFP-NOLSE-NEXT: cmp w8, w21 -; SOFTFP-NOLSE-NEXT: mov w21, w8 -; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -711,29 +732,33 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr 
%ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] -; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] +; SOFTFP-NOLSE-NEXT: cmp w22, w8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 -; SOFTFP-NOLSE-NEXT: b .LBB7_1 -; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB7_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 +; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -799,17 +824,18 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 -; SOFTFP-NOLSE-NEXT: cmp w21, w23 -; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 @@ -819,25 +845,28 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w22 +; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil 
w23, w21, #0, #16 +; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] -; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] +; SOFTFP-NOLSE-NEXT: cmp w22, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 -; SOFTFP-NOLSE-NEXT: b .LBB8_1 -; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w21 +; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_3 +; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 +; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -885,45 +914,49 @@ define <2 x float> @test_atomicrmw_fsub_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] +; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %cmpxchg.nostore ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 -; SOFTFP-NOLSE-NEXT: cmp x23, x8 -; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 -; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB9_6 ; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w23 -; SOFTFP-NOLSE-NEXT: mov w9, w0 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: mov w8, w0 +; SOFTFP-NOLSE-NEXT: mov w9, w22 +; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] -; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; 
SOFTFP-NOLSE-NEXT: ldaxr x22, [x20]
+; SOFTFP-NOLSE-NEXT: cmp x22, x9
; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1
-; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore
; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2
-; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20]
-; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3
-; SOFTFP-NOLSE-NEXT: b .LBB9_1
-; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT: mov w0, w23
-; SOFTFP-NOLSE-NEXT: mov w1, w22
+; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20]
+; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB9_3
+; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB9_2 Depth=1
+; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32
+; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_2
+; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end
+; SOFTFP-NOLSE-NEXT: mov w0, w22
+; SOFTFP-NOLSE-NEXT: mov w1, w23
; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload

From 5703d8572f1bcca7bdcd01f1d83ad98ebb07ced0 Mon Sep 17 00:00:00 2001
From: Brendan Dahl
Date: Fri, 30 Aug 2024 08:42:37 -0700
Subject: [PATCH 50/98] [WebAssembly] Add intrinsics to wasm_simd128.h for all FP16 instructions (#106465)

Getting this to work required a few additional changes:
- Add builtins for any instructions that can't be done with plain C currently.
- Add support for the saturating version of fp_to_{s,u}int_I16x8. Other vector sizes supported this already.
- Support bitcast of f16x8 to v128. Needed to return a __f16x8 as v128_t.

---
 .../clang/Basic/BuiltinsWebAssembly.def | 9 ++
 clang/lib/CodeGen/CGBuiltin.cpp | 12 ++
 clang/lib/Headers/wasm_simd128.h | 147 ++++++++++++++++++
 .../intrinsic-header-tests/wasm_simd128.c | 138 +++++++++++++++-
 .../WebAssembly/WebAssemblyISelLowering.cpp | 9 +-
 .../WebAssembly/WebAssemblyInstrSIMD.td | 28 ++--
 .../CodeGen/WebAssembly/half-precision.ll | 18 +++
 7 files changed, 348 insertions(+), 13 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
index 034d32c6291b3d..2e80eef2c8b9bc 100644
--- a/clang/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -124,6 +124,7 @@ TARGET_BUILTIN(__builtin_wasm_bitmask_i16x8, "UiV8s", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_bitmask_i32x4, "UiV4i", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_bitmask_i64x2, "UiV2LLi", "nc", "simd128")
+TARGET_BUILTIN(__builtin_wasm_abs_f16x8, "V8hV8h", "nc", "fp16")
 TARGET_BUILTIN(__builtin_wasm_abs_f32x4, "V4fV4f", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_abs_f64x2, "V2dV2d", "nc", "simd128")

@@ -140,6 +141,10 @@ TARGET_BUILTIN(__builtin_wasm_max_f16x8, "V8hV8hV8h", "nc", "fp16")
 TARGET_BUILTIN(__builtin_wasm_pmin_f16x8, "V8hV8hV8h", "nc", "fp16")
 TARGET_BUILTIN(__builtin_wasm_pmax_f16x8, "V8hV8hV8h", "nc", "fp16")
+TARGET_BUILTIN(__builtin_wasm_ceil_f16x8, "V8hV8h", "nc", "fp16")
+TARGET_BUILTIN(__builtin_wasm_floor_f16x8, "V8hV8h", "nc", "fp16")
+TARGET_BUILTIN(__builtin_wasm_trunc_f16x8, "V8hV8h", "nc", "fp16")
+TARGET_BUILTIN(__builtin_wasm_nearest_f16x8, "V8hV8h", "nc", "fp16")
 TARGET_BUILTIN(__builtin_wasm_ceil_f32x4, "V4fV4f", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_floor_f32x4, "V4fV4f", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_trunc_f32x4, "V4fV4f", "nc", "simd128")
@@ -151,9 +156,13 @@ TARGET_BUILTIN(__builtin_wasm_nearest_f64x2, "V2dV2d", "nc",
"simd128") TARGET_BUILTIN(__builtin_wasm_dot_s_i32x4_i16x8, "V4iV8sV8s", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_sqrt_f16x8, "V8hV8h", "nc", "fp16") TARGET_BUILTIN(__builtin_wasm_sqrt_f32x4, "V4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_sqrt_f64x2, "V2dV2d", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_trunc_saturate_s_i16x8_f16x8, "V8sV8h", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_trunc_saturate_u_i16x8_f16x8, "V8sV8h", "nc", "simd128") + TARGET_BUILTIN(__builtin_wasm_trunc_saturate_s_i32x4_f32x4, "V4iV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_trunc_saturate_u_i32x4_f32x4, "V4iV4f", "nc", "simd128") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 4204c8ff276ab1..c9f21f9ded24f4 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -21211,6 +21211,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32_f64: case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i64_f32: case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i64_f64: + case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i16x8_f16x8: case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32x4_f32x4: { Value *Src = EmitScalarExpr(E->getArg(0)); llvm::Type *ResT = ConvertType(E->getType()); @@ -21222,6 +21223,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32_f64: case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i64_f32: case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i64_f64: + case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i16x8_f16x8: case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32x4_f32x4: { Value *Src = EmitScalarExpr(E->getArg(0)); llvm::Type *ResT = ConvertType(E->getType()); @@ -21269,6 +21271,10 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(Intrinsic::wasm_pmax, ConvertType(E->getType())); return Builder.CreateCall(Callee, {LHS, RHS}); } + case WebAssembly::BI__builtin_wasm_ceil_f16x8: + case WebAssembly::BI__builtin_wasm_floor_f16x8: + case WebAssembly::BI__builtin_wasm_trunc_f16x8: + case WebAssembly::BI__builtin_wasm_nearest_f16x8: case WebAssembly::BI__builtin_wasm_ceil_f32x4: case WebAssembly::BI__builtin_wasm_floor_f32x4: case WebAssembly::BI__builtin_wasm_trunc_f32x4: @@ -21279,18 +21285,22 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, case WebAssembly::BI__builtin_wasm_nearest_f64x2: { unsigned IntNo; switch (BuiltinID) { + case WebAssembly::BI__builtin_wasm_ceil_f16x8: case WebAssembly::BI__builtin_wasm_ceil_f32x4: case WebAssembly::BI__builtin_wasm_ceil_f64x2: IntNo = Intrinsic::ceil; break; + case WebAssembly::BI__builtin_wasm_floor_f16x8: case WebAssembly::BI__builtin_wasm_floor_f32x4: case WebAssembly::BI__builtin_wasm_floor_f64x2: IntNo = Intrinsic::floor; break; + case WebAssembly::BI__builtin_wasm_trunc_f16x8: case WebAssembly::BI__builtin_wasm_trunc_f32x4: case WebAssembly::BI__builtin_wasm_trunc_f64x2: IntNo = Intrinsic::trunc; break; + case WebAssembly::BI__builtin_wasm_nearest_f16x8: case WebAssembly::BI__builtin_wasm_nearest_f32x4: case WebAssembly::BI__builtin_wasm_nearest_f64x2: IntNo = Intrinsic::nearbyint; @@ -21489,12 +21499,14 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(Intrinsic::wasm_bitmask, Vec->getType()); return Builder.CreateCall(Callee, {Vec}); } + case WebAssembly::BI__builtin_wasm_abs_f16x8: 
case WebAssembly::BI__builtin_wasm_abs_f32x4: case WebAssembly::BI__builtin_wasm_abs_f64x2: { Value *Vec = EmitScalarExpr(E->getArg(0)); Function *Callee = CGM.getIntrinsic(Intrinsic::fabs, Vec->getType()); return Builder.CreateCall(Callee, {Vec}); } + case WebAssembly::BI__builtin_wasm_sqrt_f16x8: case WebAssembly::BI__builtin_wasm_sqrt_f32x4: case WebAssembly::BI__builtin_wasm_sqrt_f64x2: { Value *Vec = EmitScalarExpr(E->getArg(0)); diff --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h index 2327bec52522d2..67d12f6f2cf419 100644 --- a/clang/lib/Headers/wasm_simd128.h +++ b/clang/lib/Headers/wasm_simd128.h @@ -33,6 +33,7 @@ typedef unsigned long long __u64x2 __attribute__((__vector_size__(16), __aligned__(16))); typedef float __f32x4 __attribute__((__vector_size__(16), __aligned__(16))); typedef double __f64x2 __attribute__((__vector_size__(16), __aligned__(16))); +typedef __fp16 __f16x8 __attribute__((__vector_size__(16), __aligned__(16))); typedef signed char __i8x8 __attribute__((__vector_size__(8), __aligned__(8))); typedef unsigned char __u8x8 @@ -1878,6 +1879,152 @@ wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v128_t __a, v128_t __b, v128_t __c) { (__i8x16)__a, (__i8x16)__b, (__i32x4)__c); } +// FP16 intrinsics +#define __FP16_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("fp16"), \ + __min_vector_width__(128))) + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_splat(float __a) { + return (v128_t)__builtin_wasm_splat_f16x8(__a); +} + +static __inline__ float __FP16_FN_ATTRS wasm_f16x8_extract_lane(v128_t __a, + int __i) + __REQUIRE_CONSTANT(__i) { + return __builtin_wasm_extract_lane_f16x8((__f16x8)__a, __i); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_replace_lane(v128_t __a, + int __i, + float __b) + __REQUIRE_CONSTANT(__i) { + return (v128_t)__builtin_wasm_replace_lane_f16x8((__f16x8)__a, __i, __b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_abs(v128_t __a) { + return (v128_t)__builtin_wasm_abs_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_neg(v128_t __a) { + return (v128_t)(-(__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_sqrt(v128_t __a) { + return (v128_t)__builtin_wasm_sqrt_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_ceil(v128_t __a) { + return (v128_t)__builtin_wasm_ceil_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_floor(v128_t __a) { + return (v128_t)__builtin_wasm_floor_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_trunc(v128_t __a) { + return (v128_t)__builtin_wasm_trunc_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_nearest(v128_t __a) { + return (v128_t)__builtin_wasm_nearest_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_eq(v128_t __a, v128_t __b) { + return (v128_t)((__f16x8)__a == (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_ne(v128_t __a, v128_t __b) { + return (v128_t)((__f16x8)__a != (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_lt(v128_t __a, v128_t __b) { + return (v128_t)((__f16x8)__a < (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_gt(v128_t __a, v128_t __b) { + return (v128_t)((__f16x8)__a > (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_le(v128_t __a, v128_t __b) { + return (v128_t)((__f16x8)__a <= (__f16x8)__b); +} + +static __inline__ v128_t 
__FP16_FN_ATTRS wasm_f16x8_ge(v128_t __a, v128_t __b) { + return (v128_t)((__f16x8)__a >= (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_add(v128_t __a, + v128_t __b) { + return (v128_t)((__f16x8)__a + (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_sub(v128_t __a, + v128_t __b) { + return (v128_t)((__f16x8)__a - (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_mul(v128_t __a, + v128_t __b) { + return (v128_t)((__f16x8)__a * (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_div(v128_t __a, + v128_t __b) { + return (v128_t)((__f16x8)__a / (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_min(v128_t __a, + v128_t __b) { + return (v128_t)__builtin_wasm_min_f16x8((__f16x8)__a, (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_max(v128_t __a, + v128_t __b) { + return (v128_t)__builtin_wasm_max_f16x8((__f16x8)__a, (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_pmin(v128_t __a, + v128_t __b) { + return (v128_t)__builtin_wasm_pmin_f16x8((__f16x8)__a, (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_pmax(v128_t __a, + v128_t __b) { + return (v128_t)__builtin_wasm_pmax_f16x8((__f16x8)__a, (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS +wasm_i16x8_trunc_sat_f16x8(v128_t __a) { + return (v128_t)__builtin_wasm_trunc_saturate_s_i16x8_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS +wasm_u16x8_trunc_sat_f16x8(v128_t __a) { + return (v128_t)__builtin_wasm_trunc_saturate_u_i16x8_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_convert_i16x8(v128_t __a) { + return (v128_t) __builtin_convertvector((__i16x8)__a, __f16x8); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_convert_u16x8(v128_t __a) { + return (v128_t) __builtin_convertvector((__u16x8)__a, __f16x8); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_relaxed_madd(v128_t __a, + v128_t __b, + v128_t __c) { + return (v128_t)__builtin_wasm_relaxed_madd_f16x8((__f16x8)__a, (__f16x8)__b, + (__f16x8)__c); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_relaxed_nmadd(v128_t __a, + v128_t __b, + v128_t __c) { + return (v128_t)__builtin_wasm_relaxed_nmadd_f16x8((__f16x8)__a, (__f16x8)__b, + (__f16x8)__c); +} + // Deprecated intrinsics static __inline__ v128_t __DEPRECATED_FN_ATTRS("wasm_i8x16_swizzle") diff --git a/cross-project-tests/intrinsic-header-tests/wasm_simd128.c b/cross-project-tests/intrinsic-header-tests/wasm_simd128.c index fb15e0143d3653..b601d90cfcc927 100644 --- a/cross-project-tests/intrinsic-header-tests/wasm_simd128.c +++ b/cross-project-tests/intrinsic-header-tests/wasm_simd128.c @@ -2,7 +2,7 @@ // expected-no-diagnostics // RUN: %clang %s -O2 -S -o - -target wasm32-unknown-unknown \ -// RUN: -msimd128 -mrelaxed-simd -Wcast-qual -Werror | FileCheck %s +// RUN: -msimd128 -mrelaxed-simd -mfp16 -Wcast-qual -Werror | FileCheck %s #include @@ -1385,3 +1385,139 @@ v128_t test_i16x8_relaxed_dot_i8x16_i7x16(v128_t a, v128_t b) { v128_t test_i32x4_relaxed_dot_i8x16_i7x16_add(v128_t a, v128_t b, v128_t c) { return wasm_i32x4_relaxed_dot_i8x16_i7x16_add(a, b, c); } + +// CHECK-LABEL: test_f16x8_splat: +// CHECK: f16x8.splat{{$}} +v128_t test_f16x8_splat(float a) { return wasm_f16x8_splat(a); } + +// CHECK-LABEL: test_f16x8_extract_lane: +// CHECK: f16x8.extract_lane 7{{$}} +int16_t test_f16x8_extract_lane(v128_t a) { + return wasm_f16x8_extract_lane(a, 7); +} + +// 
CHECK-LABEL: test_f16x8_replace_lane: +// CHECK: f16x8.replace_lane 7{{$}} +v128_t test_f16x8_replace_lane(v128_t a, float b) { + return wasm_f16x8_replace_lane(a, 7, b); +} + +// CHECK-LABEL: test_f16x8_abs: +// CHECK: f16x8.abs{{$}} +v128_t test_f16x8_abs(v128_t a) { return wasm_f16x8_abs(a); } + +// CHECK-LABEL: test_f16x8_neg: +// CHECK: f16x8.neg{{$}} +v128_t test_f16x8_neg(v128_t a) { return wasm_f16x8_neg(a); } + +// CHECK-LABEL: test_f16x8_sqrt: +// CHECK: f16x8.sqrt{{$}} +v128_t test_f16x8_sqrt(v128_t a) { return wasm_f16x8_sqrt(a); } + +// CHECK-LABEL: test_f16x8_ceil: +// CHECK: f16x8.ceil{{$}} +v128_t test_f16x8_ceil(v128_t a) { return wasm_f16x8_ceil(a); } + +// CHECK-LABEL: test_f16x8_floor: +// CHECK: f16x8.floor{{$}} +v128_t test_f16x8_floor(v128_t a) { return wasm_f16x8_floor(a); } + +// CHECK-LABEL: test_f16x8_trunc: +// CHECK: f16x8.trunc{{$}} +v128_t test_f16x8_trunc(v128_t a) { return wasm_f16x8_trunc(a); } + +// CHECK-LABEL: test_f16x8_nearest: +// CHECK: f16x8.nearest{{$}} +v128_t test_f16x8_nearest(v128_t a) { return wasm_f16x8_nearest(a); } + +// CHECK-LABEL: test_f16x8_add: +// CHECK: f16x8.add{{$}} +v128_t test_f16x8_add(v128_t a, v128_t b) { return wasm_f16x8_add(a, b); } + +// CHECK-LABEL: test_f16x8_sub: +// CHECK: f16x8.sub{{$}} +v128_t test_f16x8_sub(v128_t a, v128_t b) { return wasm_f16x8_sub(a, b); } + +// CHECK-LABEL: test_f16x8_mul: +// CHECK: f16x8.mul{{$}} +v128_t test_f16x8_mul(v128_t a, v128_t b) { return wasm_f16x8_mul(a, b); } + +// CHECK-LABEL: test_f16x8_div: +// CHECK: f16x8.div{{$}} +v128_t test_f16x8_div(v128_t a, v128_t b) { return wasm_f16x8_div(a, b); } + +// CHECK-LABEL: test_f16x8_min: +// CHECK: f16x8.min{{$}} +v128_t test_f16x8_min(v128_t a, v128_t b) { return wasm_f16x8_min(a, b); } + +// CHECK-LABEL: test_f16x8_max: +// CHECK: f16x8.max{{$}} +v128_t test_f16x8_max(v128_t a, v128_t b) { return wasm_f16x8_max(a, b); } + +// CHECK-LABEL: test_f16x8_pmin: +// CHECK: f16x8.pmin{{$}} +v128_t test_f16x8_pmin(v128_t a, v128_t b) { return wasm_f16x8_pmin(a, b); } + +// CHECK-LABEL: test_f16x8_pmax: +// CHECK: f16x8.pmax{{$}} +v128_t test_f16x8_pmax(v128_t a, v128_t b) { return wasm_f16x8_pmax(a, b); } + +// CHECK-LABEL: test_f16x8_eq: +// CHECK: f16x8.eq{{$}} +v128_t test_f16x8_eq(v128_t a, v128_t b) { return wasm_f16x8_eq(a, b); } + +// CHECK-LABEL: test_f16x8_ne: +// CHECK: f16x8.ne{{$}} +v128_t test_f16x8_ne(v128_t a, v128_t b) { return wasm_f16x8_ne(a, b); } + +// CHECK-LABEL: test_f16x8_lt: +// CHECK: f16x8.lt{{$}} +v128_t test_f16x8_lt(v128_t a, v128_t b) { return wasm_f16x8_lt(a, b); } + +// CHECK-LABEL: test_f16x8_gt: +// CHECK: f16x8.gt{{$}} +v128_t test_f16x8_gt(v128_t a, v128_t b) { return wasm_f16x8_gt(a, b); } + +// CHECK-LABEL: test_f16x8_le: +// CHECK: f16x8.le{{$}} +v128_t test_f16x8_le(v128_t a, v128_t b) { return wasm_f16x8_le(a, b); } + +// CHECK-LABEL: test_f16x8_ge: +// CHECK: f16x8.ge{{$}} +v128_t test_f16x8_ge(v128_t a, v128_t b) { return wasm_f16x8_ge(a, b); } + +// CHECK-LABEL: test_i16x8_trunc_sat_f16x8: +// CHECK: i16x8.trunc_sat_f16x8_s{{$}} +v128_t test_i16x8_trunc_sat_f16x8(v128_t a) { + return wasm_i16x8_trunc_sat_f16x8(a); +} + +// CHECK-LABEL: test_u16x8_trunc_sat_f16x8: +// CHECK: i16x8.trunc_sat_f16x8_u{{$}} +v128_t test_u16x8_trunc_sat_f16x8(v128_t a) { + return wasm_u16x8_trunc_sat_f16x8(a); +} + +// CHECK-LABEL: test_f16x8_convert_i16x8: +// CHECK: f16x8.convert_i16x8_s{{$}} +v128_t test_f16x8_convert_i16x8(v128_t a) { + return wasm_f16x8_convert_i16x8(a); +} + +// CHECK-LABEL: test_f16x8_convert_u16x8: 
+// CHECK: f16x8.convert_i16x8_u{{$}} +v128_t test_f16x8_convert_u16x8(v128_t a) { + return wasm_f16x8_convert_u16x8(a); +} + +// CHECK-LABEL: test_f16x8_relaxed_madd: +// CHECK: f16x8.relaxed_madd{{$}} +v128_t test_f16x8_relaxed_madd(v128_t a, v128_t b, v128_t c) { + return wasm_f16x8_relaxed_madd(a, b, c); +} + +// CHECK-LABEL: test_f16x8_relaxed_nmadd: +// CHECK: f16x8.relaxed_nmadd{{$}} +v128_t test_f16x8_relaxed_nmadd(v128_t a, v128_t b, v128_t c) { + return wasm_f16x8_relaxed_nmadd(a, b, c); +} diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 4578ff7f715146..5cc084f3ab1387 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -275,8 +275,12 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setOperationAction(Op, T, Expand); // But saturating fp_to_int converstions are - for (auto Op : {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}) + for (auto Op : {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}) { setOperationAction(Op, MVT::v4i32, Custom); + if (Subtarget->hasFP16()) { + setOperationAction(Op, MVT::v8i16, Custom); + } + } // Support vector extending for (auto T : MVT::integer_fixedlen_vector_valuetypes()) { @@ -2475,6 +2479,9 @@ SDValue WebAssemblyTargetLowering::LowerFP_TO_INT_SAT(SDValue Op, if (ResT == MVT::v4i32 && SatVT == MVT::i32) return Op; + if (ResT == MVT::v8i16 && SatVT == MVT::i16) + return Op; + return SDValue(); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 887278e9c12ef3..da4b8d228f627d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -165,8 +165,9 @@ def F16x8 : Vec { let prefix = "f16x8"; } -// TODO: Include F16x8 here when half precision is better supported. -defvar AllVecs = [I8x16, I16x8, I32x4, I64x2, F32x4, F64x2]; +// TODO: Remove StdVecs when the F16x8 works every where StdVecs is used. +defvar StdVecs = [I8x16, I16x8, I32x4, I64x2, F32x4, F64x2]; +defvar AllVecs = !listconcat(StdVecs, [F16x8]); defvar IntVecs = [I8x16, I16x8, I32x4, I64x2]; //===----------------------------------------------------------------------===// @@ -188,7 +189,7 @@ defm LOAD_V128_A64 : } // Def load patterns from WebAssemblyInstrMemory.td for vector types -foreach vec = AllVecs in { +foreach vec = StdVecs in { defm : LoadPat; } @@ -217,7 +218,7 @@ defm "" : SIMDLoadSplat<16, 8>; defm "" : SIMDLoadSplat<32, 9>; defm "" : SIMDLoadSplat<64, 10>; -foreach vec = AllVecs in { +foreach vec = StdVecs in { defvar inst = "LOAD"#vec.lane_bits#"_SPLAT"; defm : LoadPat, @@ -389,7 +390,7 @@ defm STORE_V128_A64 : } // Def store patterns from WebAssemblyInstrMemory.td for vector types -foreach vec = AllVecs in { +foreach vec = StdVecs in { defm : StorePat; } @@ -513,7 +514,7 @@ defm "" : ConstVec; // Match splat(x) -> const.v128(x, ..., x) -foreach vec = AllVecs in { +foreach vec = StdVecs in { defvar numEls = !div(vec.vt.Size, vec.lane_bits); defvar isFloat = !or(!eq(vec.lane_vt, f32), !eq(vec.lane_vt, f64)); defvar immKind = !if(isFloat, fpimm, imm); @@ -557,7 +558,7 @@ defm SHUFFLE : // Shuffles after custom lowering def wasm_shuffle_t : SDTypeProfile<1, 18, []>; def wasm_shuffle : SDNode<"WebAssemblyISD::SHUFFLE", wasm_shuffle_t>; -foreach vec = AllVecs in { +foreach vec = StdVecs in { // The @llvm.wasm.shuffle intrinsic has immediate arguments that become TargetConstants. 
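
Stepping back from the pattern changes, the user-facing side of this patch is easiest to see from code against the new header entries. A minimal sketch, assuming a clang invocation like the test's RUN line above (wasm32 target with -msimd128 -mrelaxed-simd -mfp16); the function name is illustrative:

#include <wasm_simd128.h>

// Compose several of the newly added f16x8 intrinsics: fused multiply-add,
// clamp below at zero via pseudo-max, then saturating conversion to i16x8.
v128_t scale_bias_to_i16(v128_t x, v128_t scale, v128_t bias) {
  v128_t y = wasm_f16x8_relaxed_madd(x, scale, bias); // f16x8.relaxed_madd
  y = wasm_f16x8_pmax(y, wasm_f16x8_splat(0.0f));     // f16x8.pmax
  return wasm_i16x8_trunc_sat_f16x8(y);               // i16x8.trunc_sat_f16x8_s
}
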
def : Pat<(vec.vt (wasm_shuffle (vec.vt V128:$x), (vec.vt V128:$y), (i32 timm:$m0), (i32 timm:$m1), @@ -627,7 +628,7 @@ defm SPLAT_F16x8 : "f16x8.splat\t$dst, $x", "f16x8.splat", 0x120>; // scalar_to_vector leaves high lanes undefined, so can be a splat -foreach vec = AllVecs in +foreach vec = StdVecs in def : Pat<(vec.vt (scalar_to_vector (vec.lane_vt vec.lane_rc:$x))), (!cast("SPLAT_"#vec) $x)>; @@ -880,7 +881,7 @@ defm BITSELECT : SIMD_I<(outs V128:$dst), (ins V128:$v1, V128:$v2, V128:$c), (outs), (ins), [], "v128.bitselect\t$dst, $v1, $v2, $c", "v128.bitselect", 82>; -foreach vec = AllVecs in +foreach vec = StdVecs in def : Pat<(vec.vt (int_wasm_bitselect (vec.vt V128:$v1), (vec.vt V128:$v2), (vec.vt V128:$c))), (BITSELECT $v1, $v2, $c)>; @@ -906,7 +907,7 @@ def : Pat<(vec.vt (xor (and (xor (vec.vt V128:$v1), (vec.vt V128:$v2)), (BITSELECT $v2, $v1, $c)>; // Also implement vselect in terms of bitselect -foreach vec = AllVecs in +foreach vec = StdVecs in def : Pat<(vec.vt (vselect (vec.int_vt V128:$c), (vec.vt V128:$v1), (vec.vt V128:$v2))), (BITSELECT $v1, $v2, $c)>; @@ -916,7 +917,7 @@ defm SELECT_V128 : I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs, I32:$cond), (outs), (ins), [], "v128.select\t$dst, $lhs, $rhs, $cond", "v128.select", 0x1b>; -foreach vec = AllVecs in { +foreach vec = StdVecs in { def : Pat<(select I32:$cond, (vec.vt V128:$lhs), (vec.vt V128:$rhs)), (SELECT_V128 $lhs, $rhs, $cond)>; @@ -1370,6 +1371,11 @@ def trunc_u_sat32 : PatFrag<(ops node:$x), (fp_to_uint_sat $x, i32)>; def : Pat<(v4i32 (trunc_s_sat32 (v4f32 V128:$src))), (fp_to_sint_I32x4 $src)>; def : Pat<(v4i32 (trunc_u_sat32 (v4f32 V128:$src))), (fp_to_uint_I32x4 $src)>; +def trunc_s_sat16 : PatFrag<(ops node:$x), (fp_to_sint_sat $x, i16)>; +def trunc_u_sat16 : PatFrag<(ops node:$x), (fp_to_uint_sat $x, i16)>; +def : Pat<(v8i16 (trunc_s_sat16 (v8f16 V128:$src))), (fp_to_sint_I16x8 $src)>; +def : Pat<(v8i16 (trunc_u_sat16 (v8f16 V128:$src))), (fp_to_uint_I16x8 $src)>; + def trunc_sat_zero_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; def trunc_sat_zero_s : SDNode<"WebAssemblyISD::TRUNC_SAT_ZERO_S", trunc_sat_zero_t>; diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll index adba502335f86c..c0b14d2064d5eb 100644 --- a/llvm/test/CodeGen/WebAssembly/half-precision.ll +++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll @@ -290,3 +290,21 @@ define <8 x i16> @trunc_sat_u_v8i16(<8 x half> %x) { %a = fptoui <8 x half> %x to <8 x i16> ret <8 x i16> %a } + +define <8 x i16> @trunc_sat_s_v8i16_sat(<8 x half> %x) { +; CHECK-LABEL: trunc_sat_s_v8i16_sat: +; CHECK: .functype trunc_sat_s_v8i16_sat (v128) -> (v128) +; CHECK-NEXT: i16x8.trunc_sat_f16x8_s $push0=, $0 +; CHECK-NEXT: return $pop[[R]]{{$}} + %a = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> %x) + ret <8 x i16> %a +} + +define <8 x i16> @trunc_sat_u_v8i16_sat(<8 x half> %x) { +; CHECK-LABEL: trunc_sat_u_v8i16_sat: +; CHECK: .functype trunc_sat_u_v8i16_sat (v128) -> (v128) +; CHECK-NEXT: i16x8.trunc_sat_f16x8_u $push0=, $0 +; CHECK-NEXT: return $pop[[R]]{{$}} + %a = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> %x) + ret <8 x i16> %a +} From c55e24b8507d47a8cc04b5d9570e8e3d02be1ca3 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 30 Aug 2024 15:29:45 +0000 Subject: [PATCH 51/98] [llvm][LoongArch] Fix BSTRINS_D test failures on 32 bit hosts eaf87d32754beb5bec10bab517bf56e25575b48e added new code that uses 64 bit types and ULL for constants, mostly, but a few UL snuck in. 
UL is still 4 bytes on 32 bit, ULL is 8. This fixes test failures on 32 bit Arm: https://lab.llvm.org/buildbot/#/builders/39/builds/1338 --- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp index 1ce1a9845db21c..6ad2c003558a51 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp @@ -97,7 +97,7 @@ LoongArchMatInt::InstSeq LoongArchMatInt::generateInstSeq(int64_t Val) { for (uint64_t Lsb = Msb; Lsb > 0; --Lsb) { uint64_t LowMask = (1ULL << Lsb) - 1; uint64_t Mask = HighMask | LowMask; - uint64_t LsbToZero = TmpVal1 & ((1UL << (Msb - Lsb + 1)) - 1); + uint64_t LsbToZero = TmpVal1 & ((1ULL << (Msb - Lsb + 1)) - 1); uint64_t MsbToLsb = LsbToZero << Lsb; if ((MsbToLsb | (TmpVal1 & Mask)) == (uint64_t)Val) { if (Insts[1].Opc == LoongArch::ORI && N == 3) @@ -107,7 +107,7 @@ LoongArchMatInt::InstSeq LoongArchMatInt::generateInstSeq(int64_t Val) { return Insts; } if (TmpVal2 != 0) { - LsbToZero = TmpVal2 & ((1UL << (Msb - Lsb + 1)) - 1); + LsbToZero = TmpVal2 & ((1ULL << (Msb - Lsb + 1)) - 1); MsbToLsb = LsbToZero << Lsb; if ((MsbToLsb | (TmpVal2 & Mask)) == (uint64_t)Val) { Insts[0] = Insts[1]; From d58d105cdaf366d7db3f60d356b21bc8e64666fb Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 30 Aug 2024 16:49:23 +0100 Subject: [PATCH 52/98] [Analysis] isTriviallyVectorizable - add vectorization support for acos/asin/atan and cosh/sinh/tanh intrinsics (#106584) Show fallback cases in amdlibm tests where it doesn't have that specific op --- llvm/lib/Analysis/VectorUtils.cpp | 6 ++ .../LoopVectorize/X86/amdlibm-calls.ll | 13 ++++ ...ccelerate-vector-functions-inseltpoison.ll | 72 ++++++++----------- .../AArch64/accelerate-vector-functions.ll | 72 ++++++++----------- 4 files changed, 79 insertions(+), 84 deletions(-) diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index cc742ab35f4498..32ce34114b2f50 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -66,9 +66,15 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::umul_fix: case Intrinsic::umul_fix_sat: case Intrinsic::sqrt: // Begin floating-point. 
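
Returning to the LoongArch shift fix above: a self-contained illustration of the UL-versus-ULL trap it closes. On an ILP32 host, unsigned long is 32 bits wide, so shifting 1UL by 32 or more is undefined behavior, while 1ULL always has a full 64 bits to shift into. The values below are made up for the example:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t msb = 40, lsb = 5; // a field wider than 32 bits
  // With 1UL, this shift is UB on 32-bit hosts (count >= bit-width):
  //   uint64_t mask = (1UL << (msb - lsb + 1)) - 1;
  uint64_t mask = (1ULL << (msb - lsb + 1)) - 1; // 64-bit on every host
  std::printf("mask = %#llx\n", (unsigned long long)mask);
  return 0;
}
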
+ case Intrinsic::asin: + case Intrinsic::acos: + case Intrinsic::atan: case Intrinsic::sin: case Intrinsic::cos: case Intrinsic::tan: + case Intrinsic::sinh: + case Intrinsic::cosh: + case Intrinsic::tanh: case Intrinsic::exp: case Intrinsic::exp2: case Intrinsic::log: diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll index 04289d43f40e2f..c051e2f18380bd 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll @@ -414,6 +414,7 @@ for.end: define void @acos_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @acos_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_acosf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_acosf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_acosf(<16 x float> [[TMP4:%.*]]) @@ -487,7 +488,10 @@ for.end: define void @asin_f64_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @asin_f64_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @llvm.asin.v2f64(<2 x double> [[TMP4:%.*]]) +; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @llvm.asin.v4f64(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_asin(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.asin.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -510,6 +514,7 @@ for.end: define void @asin_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @asin_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_asinf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_asinf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_asinf(<16 x float> [[TMP4:%.*]]) @@ -588,6 +593,7 @@ define void @atan_f64_intrinsic(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_atan(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_atan(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_atan(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.atan.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -610,6 +616,7 @@ for.end: define void @atan_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @atan_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_atanf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_atanf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_atanf(<16 x float> [[TMP4:%.*]]) @@ -683,6 +690,9 @@ for.end: define void @cosh_f64_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @cosh_f64_intrinsic( ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_cosh(<2 x double> [[TMP4:%.*]]) +; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @llvm.cosh.v4f64(<4 x double> [[TMP4:%.*]]) +; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @llvm.cosh.v8f64(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.cosh.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -705,8 +715,10 @@ for.end: define void @cosh_f32_intrinsic(ptr 
nocapture %varray) { ; CHECK-LABEL: @cosh_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_coshf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_coshf(<8 x float> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @llvm.cosh.v16f32(<16 x float> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -754,6 +766,7 @@ for.end: define void @tanh_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @tanh_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_tanhf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_tanhf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_tanhf(<16 x float> [[TMP4:%.*]]) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll index 809059034c7f98..da827a5b674d8b 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll @@ -627,13 +627,11 @@ define <4 x float> @int_asin_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -708,13 +706,11 @@ define <4 x float> @int_acos_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call 
fast float @llvm.acos.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -789,13 +785,11 @@ define <4 x float> @int_atan_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -870,13 +864,11 @@ define <4 x float> @int_sinh_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x 
i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -951,13 +943,11 @@ define <4 x float> @int_cosh_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -1032,13 +1022,11 @@ define <4 x float> @int_tanh_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll index 36633a1053b14f..62b8c0ce3291a6 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll @@ -627,13 +627,11 @@ define <4 x float> @int_asin_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], 
i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -708,13 +706,11 @@ define <4 x float> @int_acos_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -789,13 +785,11 @@ define <4 x float> @int_atan_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast 
float @llvm.atan.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -870,13 +864,11 @@ define <4 x float> @int_sinh_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -951,13 +943,11 @@ define <4 x float> @int_cosh_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; 
NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -1032,13 +1022,11 @@ define <4 x float> @int_tanh_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 From 68f0d20a9b507383a7577144bbd4811abe787e42 Mon Sep 17 00:00:00 2001 From: JoelWee <32009741+JoelWee@users.noreply.github.com> Date: Fri, 30 Aug 2024 16:58:19 +0100 Subject: [PATCH 53/98] Fix clang after ece6566048086cf2870d2c2bff46384df1b9e531 --- mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp index f93e1cc8780c79..9cc66207660f64 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp @@ -152,6 +152,7 @@ static unsigned getUnidirectionalFenceProxyID(NVVM::ProxyKind fromProxy, } llvm_unreachable("Unknown scope for uni-directional fence.proxy operation"); } + llvm_unreachable("Unsupported proxy kinds"); } namespace { From a2615ad45c73095dfda6ae546de107aacb10cbb7 Mon Sep 17 00:00:00 2001 From: JoelWee <32009741+JoelWee@users.noreply.github.com> Date: Fri, 30 Aug 2024 16:58:41 +0100 Subject: [PATCH 54/98] [mlir] Align mlir::Block (#106717) This fixes an error from the LatticeAnchor PointerUnion with ProgramPoint in b6603e1bf1 ``` third_party/llvm/llvm-project/llvm/include/llvm/ADT/PointerIntPair.h:172:17: error: static assertion failed due to requirement '2U <= PointerUnionUIntTraits::NumLowBitsAvailable': PointerIntPair with integer size too large for pointer 172 | static_assert(IntBits <= PtrTraits::NumLowBitsAvailable, ``` --- mlir/include/mlir/IR/Block.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/IR/Block.h b/mlir/include/mlir/IR/Block.h index e4fddfcb7608e6..67825eb1704bbe 100644 --- a/mlir/include/mlir/IR/Block.h +++ b/mlir/include/mlir/IR/Block.h @@ -27,8 +27,8 @@ template class ValueTypeRange; /// `Block` represents an ordered list of `Operation`s. 
-class Block : public IRObjectWithUseList, - public llvm::ilist_node_with_parent { +class alignas(8) Block : public IRObjectWithUseList, + public llvm::ilist_node_with_parent { public: explicit Block() = default; ~Block(); From 97122550961944f2376f0e84a73cdd5b9e042bc4 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Fri, 30 Aug 2024 12:02:46 -0400 Subject: [PATCH 55/98] Fix a minor issue with the documentation; NFC --- clang-tools-extra/docs/clang-tidy/Contributing.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/clang-tools-extra/docs/clang-tidy/Contributing.rst b/clang-tools-extra/docs/clang-tidy/Contributing.rst index b04809c3308f17..d5303418b859b2 100644 --- a/clang-tools-extra/docs/clang-tidy/Contributing.rst +++ b/clang-tools-extra/docs/clang-tidy/Contributing.rst @@ -340,6 +340,7 @@ expression incrementally and use :program:`clang-query`'s ``let`` command to sav matching expressions to simplify your matcher. .. code-block:: console + clang-query> let c1 cxxRecordDecl() clang-query> match c1 From 1faa9c8a023fb42fda31fa1e6dd6d6d462fb7619 Mon Sep 17 00:00:00 2001 From: Paul T Robinson Date: Fri, 30 Aug 2024 09:08:47 -0700 Subject: [PATCH 56/98] =?UTF-8?q?[Security]=20Nominate=20Matthew=20Voss=20?= =?UTF-8?q?to=20replace=20Paul=20Robinson=20on=20the=20Secu=E2=80=A6=20(#1?= =?UTF-8?q?06112)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …rity Group Matthew is a member of Sony's PS4/PS5 toolchain team, most visible for his work on LTO, but he also has a long-standing interest in security. He will replace Paul as one of Sony's participants in the Security Group as Paul will be retiring from Sony at the end of September. --- llvm/docs/Security.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/docs/Security.rst b/llvm/docs/Security.rst index 9bd2b1d435fd0a..2b5b5139858e7f 100644 --- a/llvm/docs/Security.rst +++ b/llvm/docs/Security.rst @@ -46,9 +46,9 @@ username for an individual isn't available, the brackets will be empty. * Josh Stone (Red Hat; Rust) [@cuviper] * Kristof Beyls (ARM) [@kbeyls] * Matthew Riley (Google) [@mmdriley] +* Matthew Voss (Sony) [@ormris] * Nikhil Gupta (Nvidia) [] * Oliver Hunt (Apple) [@ojhunt] -* Paul Robinson (Sony) [@pogo59] * Peter Smith (ARM) [@smithp35] * Pietro Albini (Ferrous Systems; Rust) [@pietroalbini] * Serge Guelton (Mozilla) [@serge-sans-paille] From 348e74139ad7f06fdf8b332a81842de4bdf03b0c Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Fri, 30 Aug 2024 12:07:07 -0400 Subject: [PATCH 57/98] [libc++][NFC] Run clang-format on libcxx/include This re-formats a few headers that had become out-of-sync with respect to formatting since we ran clang-format on the whole codebase. There's surprisingly few instances of it. 
--- libcxx/include/__chrono/leap_second.h | 2 +- .../include/__chrono/parser_std_format_spec.h | 2 +- libcxx/include/__chrono/statically_widen.h | 2 +- libcxx/include/__chrono/time_zone_link.h | 2 +- libcxx/include/__expected/expected.h | 37 +++++++++-------- libcxx/include/__format/buffer.h | 2 +- libcxx/include/__format/concepts.h | 4 +- libcxx/include/__format/container_adaptor.h | 2 +- libcxx/include/__format/enable_insertable.h | 2 +- .../include/__format/escaped_output_table.h | 2 +- .../extended_grapheme_cluster_table.h | 2 +- libcxx/include/__format/format_arg.h | 2 +- libcxx/include/__format/format_arg_store.h | 2 +- libcxx/include/__format/format_args.h | 2 +- libcxx/include/__format/format_context.h | 2 +- libcxx/include/__format/format_error.h | 2 +- libcxx/include/__format/format_functions.h | 4 +- .../include/__format/format_parse_context.h | 2 +- libcxx/include/__format/format_string.h | 2 +- libcxx/include/__format/format_to_n_result.h | 2 +- libcxx/include/__format/formatter_bool.h | 4 +- libcxx/include/__format/formatter_char.h | 4 +- .../__format/formatter_floating_point.h | 4 +- libcxx/include/__format/formatter_integer.h | 4 +- libcxx/include/__format/formatter_integral.h | 2 +- libcxx/include/__format/formatter_output.h | 2 +- libcxx/include/__format/formatter_pointer.h | 4 +- libcxx/include/__format/formatter_string.h | 4 +- libcxx/include/__format/formatter_tuple.h | 2 +- .../__format/indic_conjunct_break_table.h | 2 +- .../include/__format/parser_std_format_spec.h | 2 +- .../__format/range_default_formatter.h | 2 +- libcxx/include/__format/range_formatter.h | 2 +- libcxx/include/__format/unicode.h | 2 +- .../include/__format/width_estimation_table.h | 2 +- libcxx/include/__fwd/format.h | 2 +- libcxx/include/__memory/allocator.h | 2 +- .../include/__type_traits/is_member_pointer.h | 4 +- libcxx/include/__type_traits/is_void.h | 4 +- libcxx/include/array | 3 +- libcxx/include/forward_list | 3 +- libcxx/include/iosfwd | 4 +- libcxx/include/list | 5 +-- libcxx/include/set | 3 +- libcxx/include/string | 6 +-- libcxx/include/syncstream | 6 +-- libcxx/include/tuple | 40 +++++++++---------- libcxx/include/vector | 3 +- libcxx/modules/std/format.inc | 2 +- libcxx/src/include/refstring.h | 2 +- libcxx/utils/generate_escaped_output_table.py | 2 +- ...enerate_extended_grapheme_cluster_table.py | 2 +- .../generate_indic_conjunct_break_table.py | 2 +- .../utils/generate_width_estimation_table.py | 2 +- 54 files changed, 105 insertions(+), 113 deletions(-) diff --git a/libcxx/include/__chrono/leap_second.h b/libcxx/include/__chrono/leap_second.h index d79111ed8eecfc..be3ab4235da3ca 100644 --- a/libcxx/include/__chrono/leap_second.h +++ b/libcxx/include/__chrono/leap_second.h @@ -122,7 +122,7 @@ class leap_second { } // namespace chrono -# endif //_LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__chrono/parser_std_format_spec.h b/libcxx/include/__chrono/parser_std_format_spec.h index 785bbae198e464..6803d03ad882fd 100644 --- a/libcxx/include/__chrono/parser_std_format_spec.h +++ b/libcxx/include/__chrono/parser_std_format_spec.h @@ -409,7 +409,7 @@ class _LIBCPP_TEMPLATE_VIS __parser_chrono { } // namespace __format_spec -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__chrono/statically_widen.h b/libcxx/include/__chrono/statically_widen.h index a18c46f057a819..680483a59ac2c4 100644 --- a/libcxx/include/__chrono/statically_widen.h +++ 
b/libcxx/include/__chrono/statically_widen.h @@ -45,7 +45,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr const _CharT* __statically_widen(const char* __s # define _LIBCPP_STATICALLY_WIDEN(_CharT, __str) ::std::__statically_widen<_CharT>(__str) # endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__chrono/time_zone_link.h b/libcxx/include/__chrono/time_zone_link.h index b2d365c5fd0820..7b15f6ae39278e 100644 --- a/libcxx/include/__chrono/time_zone_link.h +++ b/libcxx/include/__chrono/time_zone_link.h @@ -68,7 +68,7 @@ operator<=>(const time_zone_link& __x, const time_zone_link& __y) noexcept { } // namespace chrono -# endif //_LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__expected/expected.h b/libcxx/include/__expected/expected.h index e47ec2f28844b6..8661d5d6e9b939 100644 --- a/libcxx/include/__expected/expected.h +++ b/libcxx/include/__expected/expected.h @@ -503,25 +503,24 @@ class expected : private __expected_base<_Tp, _Err> { private: template - using __can_convert = - _And< is_constructible<_Tp, _UfQual>, - is_constructible<_Err, _OtherErrQual>, - _If<_Not, bool>>::value, - _And< - _Not<_And, is_same<_Err, _OtherErr>>>, // use the copy constructor instead, see #92676 - _Not&>>, - _Not>>, - _Not&>>, - _Not>>, - _Not&, _Tp>>, - _Not&&, _Tp>>, - _Not&, _Tp>>, - _Not&&, _Tp>>>, - true_type>, - _Not, expected<_Up, _OtherErr>&>>, - _Not, expected<_Up, _OtherErr>>>, - _Not, const expected<_Up, _OtherErr>&>>, - _Not, const expected<_Up, _OtherErr>>> >; + using __can_convert = _And< + is_constructible<_Tp, _UfQual>, + is_constructible<_Err, _OtherErrQual>, + _If<_Not, bool>>::value, + _And< _Not<_And, is_same<_Err, _OtherErr>>>, // use the copy constructor instead, see #92676 + _Not&>>, + _Not>>, + _Not&>>, + _Not>>, + _Not&, _Tp>>, + _Not&&, _Tp>>, + _Not&, _Tp>>, + _Not&&, _Tp>>>, + true_type>, + _Not, expected<_Up, _OtherErr>&>>, + _Not, expected<_Up, _OtherErr>>>, + _Not, const expected<_Up, _OtherErr>&>>, + _Not, const expected<_Up, _OtherErr>>> >; template _LIBCPP_HIDE_FROM_ABI constexpr explicit expected( diff --git a/libcxx/include/__format/buffer.h b/libcxx/include/__format/buffer.h index 8598f0a1c03957..ce9ac0c81e315a 100644 --- a/libcxx/include/__format/buffer.h +++ b/libcxx/include/__format/buffer.h @@ -646,7 +646,7 @@ class _LIBCPP_TEMPLATE_VIS __retarget_buffer { } // namespace __format -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/concepts.h b/libcxx/include/__format/concepts.h index 13380e9b91aff8..737783ed4bdeca 100644 --- a/libcxx/include/__format/concepts.h +++ b/libcxx/include/__format/concepts.h @@ -75,8 +75,8 @@ template concept __fmt_pair_like = __is_specialization_v<_Tp, pair> || (__is_specialization_v<_Tp, tuple> && tuple_size_v<_Tp> == 2); -# endif //_LIBCPP_STD_VER >= 23 -#endif //_LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/container_adaptor.h b/libcxx/include/__format/container_adaptor.h index 9f49ca03bf4f50..d3be2e18956046 100644 --- a/libcxx/include/__format/container_adaptor.h +++ b/libcxx/include/__format/container_adaptor.h @@ -66,7 +66,7 @@ template _Container> struct _LIBCPP_TEMPLATE_VIS formatter, _CharT> : public __formatter_container_adaptor, _CharT> {}; -#endif //_LIBCPP_STD_VER >= 23 +#endif // 
_LIBCPP_STD_VER >= 23 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/enable_insertable.h b/libcxx/include/__format/enable_insertable.h index 86ef94a325b192..29fe566ff06a3f 100644 --- a/libcxx/include/__format/enable_insertable.h +++ b/libcxx/include/__format/enable_insertable.h @@ -28,7 +28,7 @@ inline constexpr bool __enable_insertable = false; } // namespace __format -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/escaped_output_table.h b/libcxx/include/__format/escaped_output_table.h index f7be2dc61f21a3..bdf86cb6f99ccb 100644 --- a/libcxx/include/__format/escaped_output_table.h +++ b/libcxx/include/__format/escaped_output_table.h @@ -856,7 +856,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[711] = { // clang-format on } // namespace __escaped_output_table -#endif //_LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 23 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/extended_grapheme_cluster_table.h b/libcxx/include/__format/extended_grapheme_cluster_table.h index 48581d8a5dde3d..7dbc239f5f5cd6 100644 --- a/libcxx/include/__format/extended_grapheme_cluster_table.h +++ b/libcxx/include/__format/extended_grapheme_cluster_table.h @@ -1656,7 +1656,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[1496] = { } // namespace __extended_grapheme_custer_property_boundary -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/format_arg.h b/libcxx/include/__format/format_arg.h index aa02f81dc40e2d..d1ce055874413e 100644 --- a/libcxx/include/__format/format_arg.h +++ b/libcxx/include/__format/format_arg.h @@ -392,7 +392,7 @@ _LIBCPP_DEPRECATED_IN_CXX26 } } -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/format_arg_store.h b/libcxx/include/__format/format_arg_store.h index 23a599e9957599..00de1c30b8733b 100644 --- a/libcxx/include/__format/format_arg_store.h +++ b/libcxx/include/__format/format_arg_store.h @@ -259,7 +259,7 @@ struct _LIBCPP_TEMPLATE_VIS __format_arg_store { _Storage __storage; }; -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/format_args.h b/libcxx/include/__format/format_args.h index 07923570f38930..e19b4458e41a5b 100644 --- a/libcxx/include/__format/format_args.h +++ b/libcxx/include/__format/format_args.h @@ -71,7 +71,7 @@ class _LIBCPP_TEMPLATE_VIS basic_format_args { template basic_format_args(__format_arg_store<_Context, _Args...>) -> basic_format_args<_Context>; -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/format_context.h b/libcxx/include/__format/format_context.h index 71783c55d72540..a9be17b855837d 100644 --- a/libcxx/include/__format/format_context.h +++ b/libcxx/include/__format/format_context.h @@ -212,7 +212,7 @@ class _LIBCPP_TEMPLATE_VIS basic_format_context= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/format_error.h b/libcxx/include/__format/format_error.h index ed40e395d6af72..35a39ee82f3daf 100644 --- a/libcxx/include/__format/format_error.h +++ b/libcxx/include/__format/format_error.h @@ -43,7 +43,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_format_error(const ch # endif } -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 
_LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/format_functions.h b/libcxx/include/__format/format_functions.h index d14b49aff14957..1518ab5768d243 100644 --- a/libcxx/include/__format/format_functions.h +++ b/libcxx/include/__format/format_functions.h @@ -360,7 +360,7 @@ _LIBCPP_HIDE_FROM_ABI inline __runtime_format_string runtime_format(wst return __fmt; } # endif -# endif //_LIBCPP_STD_VER >= 26 +# endif // _LIBCPP_STD_VER >= 26 template struct _LIBCPP_TEMPLATE_VIS basic_format_string { @@ -671,7 +671,7 @@ formatted_size(locale __loc, wformat_string<_Args...> __fmt, _Args&&... __args) # endif // _LIBCPP_HAS_NO_LOCALIZATION -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/format_parse_context.h b/libcxx/include/__format/format_parse_context.h index aefcd5497f3b9b..54c23014e7dc60 100644 --- a/libcxx/include/__format/format_parse_context.h +++ b/libcxx/include/__format/format_parse_context.h @@ -98,7 +98,7 @@ using format_parse_context = basic_format_parse_context; using wformat_parse_context = basic_format_parse_context; # endif -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/format_string.h b/libcxx/include/__format/format_string.h index bdf3cff7f49b18..a499afee8874a5 100644 --- a/libcxx/include/__format/format_string.h +++ b/libcxx/include/__format/format_string.h @@ -153,7 +153,7 @@ __parse_arg_id(_Iterator __begin, _Iterator __end, auto& __parse_ctx) { } // namespace __format -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/format_to_n_result.h b/libcxx/include/__format/format_to_n_result.h index 6f30546dec081c..344299e32f0ee6 100644 --- a/libcxx/include/__format/format_to_n_result.h +++ b/libcxx/include/__format/format_to_n_result.h @@ -28,7 +28,7 @@ struct _LIBCPP_TEMPLATE_VIS format_to_n_result { }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(format_to_n_result); -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/formatter_bool.h b/libcxx/include/__format/formatter_bool.h index 63aa815efbe9b3..a43eba53c93701 100644 --- a/libcxx/include/__format/formatter_bool.h +++ b/libcxx/include/__format/formatter_bool.h @@ -72,8 +72,8 @@ struct _LIBCPP_TEMPLATE_VIS formatter { # if _LIBCPP_STD_VER >= 23 template <> inline constexpr bool enable_nonlocking_formatter_optimization = true; -# endif //_LIBCPP_STD_VER >= 23 -#endif //_LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/formatter_char.h b/libcxx/include/__format/formatter_char.h index abfd65a4282989..a96acba08d5ca5 100644 --- a/libcxx/include/__format/formatter_char.h +++ b/libcxx/include/__format/formatter_char.h @@ -92,9 +92,9 @@ inline constexpr bool enable_nonlocking_formatter_optimization = true; template <> inline constexpr bool enable_nonlocking_formatter_optimization = true; # endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS -# endif //_LIBCPP_STD_VER >= 23 +# endif // _LIBCPP_STD_VER >= 23 -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/formatter_floating_point.h b/libcxx/include/__format/formatter_floating_point.h index 334755f4e8143b..fc95dd3f22bbe7 100644 --- a/libcxx/include/__format/formatter_floating_point.h +++ 
b/libcxx/include/__format/formatter_floating_point.h @@ -781,8 +781,8 @@ template <> inline constexpr bool enable_nonlocking_formatter_optimization = true; template <> inline constexpr bool enable_nonlocking_formatter_optimization = true; -# endif //_LIBCPP_STD_VER >= 23 -#endif //_LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/formatter_integer.h b/libcxx/include/__format/formatter_integer.h index 2c2e7995053671..b7f46014c57231 100644 --- a/libcxx/include/__format/formatter_integer.h +++ b/libcxx/include/__format/formatter_integer.h @@ -118,8 +118,8 @@ inline constexpr bool enable_nonlocking_formatter_optimization inline constexpr bool enable_nonlocking_formatter_optimization<__uint128_t> = true; # endif -# endif //_LIBCPP_STD_VER >= 23 -#endif //_LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/formatter_integral.h b/libcxx/include/__format/formatter_integral.h index eca966f8886f84..beed3ab8d93df1 100644 --- a/libcxx/include/__format/formatter_integral.h +++ b/libcxx/include/__format/formatter_integral.h @@ -436,7 +436,7 @@ __format_bool(bool __value, _FormatContext& __ctx, __format_spec::__parsed_speci } // namespace __formatter -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/formatter_output.h b/libcxx/include/__format/formatter_output.h index 1498f64c4aeff7..34c4c87313a450 100644 --- a/libcxx/include/__format/formatter_output.h +++ b/libcxx/include/__format/formatter_output.h @@ -326,7 +326,7 @@ _LIBCPP_HIDE_FROM_ABI int __truncate(basic_string_view<_CharT>& __str, int __pre } // namespace __formatter -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/formatter_pointer.h b/libcxx/include/__format/formatter_pointer.h index e1c062cec6ed2b..6e0fa9a1b4f196 100644 --- a/libcxx/include/__format/formatter_pointer.h +++ b/libcxx/include/__format/formatter_pointer.h @@ -72,8 +72,8 @@ template <> inline constexpr bool enable_nonlocking_formatter_optimization = true; template <> inline constexpr bool enable_nonlocking_formatter_optimization = true; -# endif //_LIBCPP_STD_VER >= 23 -#endif //_LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/formatter_string.h b/libcxx/include/__format/formatter_string.h index dee2b3ad073a51..b29e97847f0ba1 100644 --- a/libcxx/include/__format/formatter_string.h +++ b/libcxx/include/__format/formatter_string.h @@ -167,8 +167,8 @@ inline constexpr bool enable_nonlocking_formatter_optimization inline constexpr bool enable_nonlocking_formatter_optimization> = true; # endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS -# endif //_LIBCPP_STD_VER >= 23 -#endif //_LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/formatter_tuple.h b/libcxx/include/__format/formatter_tuple.h index 030097a8797dae..bb841ef11440dd 100644 --- a/libcxx/include/__format/formatter_tuple.h +++ b/libcxx/include/__format/formatter_tuple.h @@ -143,7 +143,7 @@ template <__fmt_char_type _CharT, formattable<_CharT>... 
_Args> struct _LIBCPP_TEMPLATE_VIS formatter, _CharT> : public __formatter_tuple<_CharT, tuple<_Args...>, _Args...> {}; -#endif //_LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 23 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/indic_conjunct_break_table.h b/libcxx/include/__format/indic_conjunct_break_table.h index 44521d27498c3c..39dd45da771fc2 100644 --- a/libcxx/include/__format/indic_conjunct_break_table.h +++ b/libcxx/include/__format/indic_conjunct_break_table.h @@ -343,7 +343,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[201] = { } // namespace __indic_conjunct_break -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/parser_std_format_spec.h b/libcxx/include/__format/parser_std_format_spec.h index 150bdde89f3b39..28891e5d2876cd 100644 --- a/libcxx/include/__format/parser_std_format_spec.h +++ b/libcxx/include/__format/parser_std_format_spec.h @@ -1163,7 +1163,7 @@ __estimate_column_width(basic_string_view<_CharT> __str, size_t __maximum, __col } // namespace __format_spec -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/range_default_formatter.h b/libcxx/include/__format/range_default_formatter.h index b35223ae933291..fb21b0f8beb3a1 100644 --- a/libcxx/include/__format/range_default_formatter.h +++ b/libcxx/include/__format/range_default_formatter.h @@ -207,7 +207,7 @@ template requires(format_kind<_Rp> != range_format::disabled && formattable, _CharT>) struct _LIBCPP_TEMPLATE_VIS formatter<_Rp, _CharT> : __range_default_formatter, _Rp, _CharT> {}; -#endif //_LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 23 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/range_formatter.h b/libcxx/include/__format/range_formatter.h index 69156307434937..def55c86ce51cd 100644 --- a/libcxx/include/__format/range_formatter.h +++ b/libcxx/include/__format/range_formatter.h @@ -257,7 +257,7 @@ struct _LIBCPP_TEMPLATE_VIS range_formatter { basic_string_view<_CharT> __closing_bracket_ = _LIBCPP_STATICALLY_WIDEN(_CharT, "]"); }; -#endif //_LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 23 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/unicode.h b/libcxx/include/__format/unicode.h index de7d0fea1df56a..ce6d55ae346a3f 100644 --- a/libcxx/include/__format/unicode.h +++ b/libcxx/include/__format/unicode.h @@ -595,7 +595,7 @@ class __code_point_view { } // namespace __unicode -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/width_estimation_table.h b/libcxx/include/__format/width_estimation_table.h index 11f61dea18d696..23a08746b91031 100644 --- a/libcxx/include/__format/width_estimation_table.h +++ b/libcxx/include/__format/width_estimation_table.h @@ -263,7 +263,7 @@ inline constexpr uint32_t __table_upper_bound = 0x0003fffd; } // namespace __width_estimation_table -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__fwd/format.h b/libcxx/include/__fwd/format.h index b30c220f8a0435..815e3e1922c62d 100644 --- a/libcxx/include/__fwd/format.h +++ b/libcxx/include/__fwd/format.h @@ -31,7 +31,7 @@ class _LIBCPP_TEMPLATE_VIS basic_format_context; template struct _LIBCPP_TEMPLATE_VIS formatter; -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__memory/allocator.h 
b/libcxx/include/__memory/allocator.h index 0dbdc41d3c3d14..6a9eed926e05f4 100644 --- a/libcxx/include/__memory/allocator.h +++ b/libcxx/include/__memory/allocator.h @@ -47,7 +47,7 @@ class _LIBCPP_TEMPLATE_VIS allocator { typedef allocator<_Up> other; }; }; -#endif // _LIBCPP_STD_VER <= 17 +#endif // _LIBCPP_STD_VER <= 17 // This class provides a non-trivial default constructor to the class that derives from it // if the condition is satisfied. diff --git a/libcxx/include/__type_traits/is_member_pointer.h b/libcxx/include/__type_traits/is_member_pointer.h index cc125e318cf919..3e2753ac4228c2 100644 --- a/libcxx/include/__type_traits/is_member_pointer.h +++ b/libcxx/include/__type_traits/is_member_pointer.h @@ -27,7 +27,7 @@ struct _LIBCPP_TEMPLATE_VIS is_member_object_pointer : _BoolConstant<__is_member template struct _LIBCPP_TEMPLATE_VIS is_member_function_pointer : _BoolConstant<__is_member_function_pointer(_Tp)> {}; -# if _LIBCPP_STD_VER >= 17 +#if _LIBCPP_STD_VER >= 17 template inline constexpr bool is_member_pointer_v = __is_member_pointer(_Tp); @@ -36,7 +36,7 @@ inline constexpr bool is_member_object_pointer_v = __is_member_object_pointer(_T template inline constexpr bool is_member_function_pointer_v = __is_member_function_pointer(_Tp); -# endif +#endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_void.h b/libcxx/include/__type_traits/is_void.h index 46316b0d3a534e..562faae9fba2cd 100644 --- a/libcxx/include/__type_traits/is_void.h +++ b/libcxx/include/__type_traits/is_void.h @@ -21,10 +21,10 @@ _LIBCPP_BEGIN_NAMESPACE_STD template struct _LIBCPP_TEMPLATE_VIS is_void : _BoolConstant<__is_same(__remove_cv(_Tp), void)> {}; -# if _LIBCPP_STD_VER >= 17 +#if _LIBCPP_STD_VER >= 17 template inline constexpr bool is_void_v = __is_same(__remove_cv(_Tp), void); -# endif +#endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/array b/libcxx/include/array index 4db0cb7bd7e3b5..588664ace0162a 100644 --- a/libcxx/include/array +++ b/libcxx/include/array @@ -427,8 +427,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator>=(const array<_Tp, _Size>& __x, const template _LIBCPP_HIDE_FROM_ABI constexpr __synth_three_way_result<_Tp> operator<=>(const array<_Tp, _Size>& __x, const array<_Tp, _Size>& __y) { - return std::lexicographical_compare_three_way( - __x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); + return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); } #endif // _LIBCPP_STD_VER <= 17 diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list index b8e3d05588f96e..6c0dc5f96a5d5e 100644 --- a/libcxx/include/forward_list +++ b/libcxx/include/forward_list @@ -1517,8 +1517,7 @@ operator<=(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc> template _LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Tp> operator<=>(const forward_list<_Tp, _Allocator>& __x, const forward_list<_Tp, _Allocator>& __y) { - return std::lexicographical_compare_three_way( - __x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); + return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); } #endif // #if _LIBCPP_STD_VER <= 17 diff --git a/libcxx/include/iosfwd b/libcxx/include/iosfwd index 051c73995e98b4..eeafcc37c598ef 100644 --- a/libcxx/include/iosfwd +++ b/libcxx/include/iosfwd @@ -170,8 +170,8 @@ class __save_flags { _CharT __fill_; public: - __save_flags(const __save_flags&) = delete; - __save_flags& 
operator=(const __save_flags&) = delete; + __save_flags(const __save_flags&) = delete; + __save_flags& operator=(const __save_flags&) = delete; _LIBCPP_HIDE_FROM_ABI explicit __save_flags(__stream_type& __stream) : __stream_(__stream), __fmtflags_(__stream.flags()), __fill_(__stream.fill()) {} diff --git a/libcxx/include/list b/libcxx/include/list index 929c84de7be449..76b1d9241b41ca 100644 --- a/libcxx/include/list +++ b/libcxx/include/list @@ -466,7 +466,7 @@ public: template class __list_imp { public: - __list_imp(const __list_imp&) = delete; + __list_imp(const __list_imp&) = delete; __list_imp& operator=(const __list_imp&) = delete; typedef _Alloc allocator_type; @@ -1679,8 +1679,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const list<_Tp, _Alloc>& __x, const template _LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Tp> operator<=>(const list<_Tp, _Allocator>& __x, const list<_Tp, _Allocator>& __y) { - return std::lexicographical_compare_three_way( - __x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); + return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); } #endif // _LIBCPP_STD_VER <= 17 diff --git a/libcxx/include/set b/libcxx/include/set index 94533583798699..7e9661a0149ab9 100644 --- a/libcxx/include/set +++ b/libcxx/include/set @@ -1452,8 +1452,7 @@ operator<=(const multiset<_Key, _Compare, _Allocator>& __x, const multiset<_Key, template _LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Key> operator<=>(const multiset<_Key, _Allocator>& __x, const multiset<_Key, _Allocator>& __y) { - return std::lexicographical_compare_three_way( - __x.begin(), __x.end(), __y.begin(), __y.end(), __synth_three_way); + return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), __synth_three_way); } #endif // _LIBCPP_STD_VER <= 17 diff --git a/libcxx/include/string b/libcxx/include/string index 45be4050304125..15c7a2f6b988b4 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -2014,11 +2014,11 @@ private: (void)__old_mid; (void)__new_mid; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) - #if defined(__APPLE__) +# if defined(__APPLE__) // TODO: remove after addressing issue #96099 (https://github.com/llvm/llvm-project/issues/96099) - if(!__is_long()) + if (!__is_long()) return; - #endif +# endif std::__annotate_contiguous_container<_Allocator>(data(), data() + capacity() + 1, __old_mid, __new_mid); #endif } diff --git a/libcxx/include/syncstream b/libcxx/include/syncstream index a0617f4acf5b6a..fea4c66b8e118f 100644 --- a/libcxx/include/syncstream +++ b/libcxx/include/syncstream @@ -255,11 +255,9 @@ public: // [syncstream.syncbuf.cons], construction and destruction - _LIBCPP_HIDE_FROM_ABI basic_syncbuf() - : basic_syncbuf(nullptr) {} + _LIBCPP_HIDE_FROM_ABI basic_syncbuf() : basic_syncbuf(nullptr) {} - _LIBCPP_HIDE_FROM_ABI explicit basic_syncbuf(streambuf_type* __obuf) - : basic_syncbuf(__obuf, _Allocator()) {} + _LIBCPP_HIDE_FROM_ABI explicit basic_syncbuf(streambuf_type* __obuf) : basic_syncbuf(__obuf, _Allocator()) {} _LIBCPP_HIDE_FROM_ABI basic_syncbuf(streambuf_type* __obuf, _Allocator const& __alloc) : __wrapped_(__obuf), __str_(__alloc) { diff --git a/libcxx/include/tuple b/libcxx/include/tuple index 081b90c7bbec54..5161c2aa97c2ba 100644 --- a/libcxx/include/tuple +++ b/libcxx/include/tuple @@ -833,8 +833,8 @@ public: // [tuple.assign] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& - operator=(_If<_And...>::value, 
tuple, __nat> const& __tuple) - noexcept(_And...>::value) { + operator=(_If<_And...>::value, tuple, __nat> const& __tuple) noexcept( + _And...>::value) { std::__memberwise_copy_assign(*this, __tuple, typename __make_tuple_indices::type()); return *this; } @@ -857,8 +857,8 @@ public: # endif // _LIBCPP_STD_VER >= 23 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& - operator=(_If<_And...>::value, tuple, __nat>&& __tuple) - noexcept(_And...>::value) { + operator=(_If<_And...>::value, tuple, __nat>&& __tuple) noexcept( + _And...>::value) { std::__memberwise_forward_assign( *this, std::move(__tuple), __tuple_types<_Tp...>(), typename __make_tuple_indices::type()); return *this; @@ -868,8 +868,8 @@ public: class... _Up, __enable_if_t< _And< _BoolConstant, is_assignable<_Tp&, _Up const&>... >::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& operator=(tuple<_Up...> const& __tuple) - noexcept(_And...>::value) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& + operator=(tuple<_Up...> const& __tuple) noexcept(_And...>::value) { std::__memberwise_copy_assign(*this, __tuple, typename __make_tuple_indices::type()); return *this; } @@ -877,8 +877,8 @@ public: template , is_assignable<_Tp&, _Up>... >::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& operator=(tuple<_Up...>&& __tuple) - noexcept(_And...>::value) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& + operator=(tuple<_Up...>&& __tuple) noexcept(_And...>::value) { std::__memberwise_forward_assign( *this, std::move(__tuple), __tuple_types<_Up...>(), typename __make_tuple_indices::type()); return *this; @@ -942,16 +942,16 @@ public: template const&>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& operator=(pair<_Up1, _Up2> const& __pair) - noexcept(_NothrowAssignFromPair const&>::value) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& + operator=(pair<_Up1, _Up2> const& __pair) noexcept(_NothrowAssignFromPair const&>::value) { std::get<0>(*this) = __pair.first; std::get<1>(*this) = __pair.second; return *this; } template &&>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& operator=(pair<_Up1, _Up2>&& __pair) - noexcept(_NothrowAssignFromPair&&>::value) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& + operator=(pair<_Up1, _Up2>&& __pair) noexcept(_NothrowAssignFromPair&&>::value) { std::get<0>(*this) = std::forward<_Up1>(__pair.first); std::get<1>(*this) = std::forward<_Up2>(__pair.second); return *this; @@ -962,8 +962,8 @@ public: class _Up, size_t _Np, __enable_if_t< _And< _BoolConstant<_Np == sizeof...(_Tp)>, is_assignable<_Tp&, _Up const&>... >::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& operator=(array<_Up, _Np> const& __array) - noexcept(_And...>::value) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& + operator=(array<_Up, _Np> const& __array) noexcept(_And...>::value) { std::__memberwise_copy_assign(*this, __array, typename __make_tuple_indices::type()); return *this; } @@ -973,8 +973,8 @@ public: size_t _Np, class = void, __enable_if_t< _And< _BoolConstant<_Np == sizeof...(_Tp)>, is_assignable<_Tp&, _Up>... 
>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& operator=(array<_Up, _Np>&& __array) - noexcept(_And...>::value) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& + operator=(array<_Up, _Np>&& __array) noexcept(_And...>::value) { std::__memberwise_forward_assign( *this, std::move(__array), @@ -984,8 +984,8 @@ public: } // [tuple.swap] - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(tuple& __t) - noexcept(__all<__is_nothrow_swappable_v<_Tp>...>::value) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void + swap(tuple& __t) noexcept(__all<__is_nothrow_swappable_v<_Tp>...>::value) { __base_.swap(__t.__base_); } @@ -1043,8 +1043,8 @@ tuple(allocator_arg_t, _Alloc, tuple<_Tp...>) -> tuple<_Tp...>; # endif template ...>::value, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(tuple<_Tp...>& __t, tuple<_Tp...>& __u) - noexcept(__all<__is_nothrow_swappable_v<_Tp>...>::value) { +inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void +swap(tuple<_Tp...>& __t, tuple<_Tp...>& __u) noexcept(__all<__is_nothrow_swappable_v<_Tp>...>::value) { __t.swap(__u); } diff --git a/libcxx/include/vector b/libcxx/include/vector index 81aab9407714cc..0f852e7f36c29c 100644 --- a/libcxx/include/vector +++ b/libcxx/include/vector @@ -2938,8 +2938,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const vector<_Tp, _Allocator>& __x, template _LIBCPP_HIDE_FROM_ABI constexpr __synth_three_way_result<_Tp> operator<=>(const vector<_Tp, _Allocator>& __x, const vector<_Tp, _Allocator>& __y) { - return std::lexicographical_compare_three_way( - __x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); + return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); } #endif // _LIBCPP_STD_VER <= 17 diff --git a/libcxx/modules/std/format.inc b/libcxx/modules/std/format.inc index 09aa03ad73e388..8daf0de85cc412 100644 --- a/libcxx/modules/std/format.inc +++ b/libcxx/modules/std/format.inc @@ -30,7 +30,7 @@ export namespace std { #endif #if _LIBCPP_STD_VER >= 26 using std::runtime_format; -#endif //_LIBCPP_STD_VER >= 26 +#endif // _LIBCPP_STD_VER >= 26 // [format.functions], formatting functions using std::format; diff --git a/libcxx/src/include/refstring.h b/libcxx/src/include/refstring.h index 78452249f4fecf..3e0ec7a97c7bec 100644 --- a/libcxx/src/include/refstring.h +++ b/libcxx/src/include/refstring.h @@ -124,4 +124,4 @@ inline bool __libcpp_refstring::__uses_refcount() const { _LIBCPP_END_NAMESPACE_STD -#endif //_LIBCPP_REFSTRING_H +#endif // _LIBCPP_REFSTRING_H diff --git a/libcxx/utils/generate_escaped_output_table.py b/libcxx/utils/generate_escaped_output_table.py index 9dcecaa5575cdd..41524e8fe7186c 100755 --- a/libcxx/utils/generate_escaped_output_table.py +++ b/libcxx/utils/generate_escaped_output_table.py @@ -235,7 +235,7 @@ def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]: // clang-format on }} // namespace __escaped_output_table -#endif //_LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 23 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/utils/generate_extended_grapheme_cluster_table.py b/libcxx/utils/generate_extended_grapheme_cluster_table.py index 76d1e78e9239c6..558b606186130f 100755 --- a/libcxx/utils/generate_extended_grapheme_cluster_table.py +++ b/libcxx/utils/generate_extended_grapheme_cluster_table.py @@ -230,7 +230,7 @@ def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]: 
{content} }} // namespace __extended_grapheme_custer_property_boundary -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/utils/generate_indic_conjunct_break_table.py b/libcxx/utils/generate_indic_conjunct_break_table.py index 762dfa73b51f7b..e41f6e9be233d7 100755 --- a/libcxx/utils/generate_indic_conjunct_break_table.py +++ b/libcxx/utils/generate_indic_conjunct_break_table.py @@ -223,7 +223,7 @@ def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]: {content} }} // namespace __indic_conjunct_break -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/utils/generate_width_estimation_table.py b/libcxx/utils/generate_width_estimation_table.py index f4cce1071d1f15..d8c036f34e8353 100644 --- a/libcxx/utils/generate_width_estimation_table.py +++ b/libcxx/utils/generate_width_estimation_table.py @@ -261,7 +261,7 @@ def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]: {content} }} // namespace __width_estimation_table -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD From 68805de90280dc8d8df39ff3f6289033deb487cf Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 30 Aug 2024 08:57:38 -0700 Subject: [PATCH 58/98] [IVDesc] Reuse getBinOpIdentity in getRecurrenceIdentity [nfc] Avoid duplication so that we can easily tell these lists are in sync. --- llvm/lib/Analysis/IVDescriptors.cpp | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index f5258601fd5d49..ba3619417114c7 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -1040,28 +1040,13 @@ Value *RecurrenceDescriptor::getRecurrenceIdentity(RecurKind K, Type *Tp, case RecurKind::Xor: case RecurKind::Add: case RecurKind::Or: - // Adding, Xoring, Oring zero to a number does not change it. - return ConstantInt::get(Tp, 0); case RecurKind::Mul: - // Multiplying a number by 1 does not change it. - return ConstantInt::get(Tp, 1); case RecurKind::And: - // AND-ing a number with an all-1 value does not change it. - return ConstantInt::get(Tp, -1, true); case RecurKind::FMul: - // Multiplying a number by 1 does not change it. - return ConstantFP::get(Tp, 1.0L); - case RecurKind::FMulAdd: case RecurKind::FAdd: - // Adding zero to a number does not change it. - // FIXME: Ideally we should not need to check FMF for FAdd and should always - // use -0.0. However, this will currently result in mixed vectors of 0.0/-0.0. - // Instead, we should ensure that 1) the FMF from FAdd are propagated to the PHI - // nodes where possible, and 2) PHIs with the nsz flag + -0.0 use 0.0. This would - // mean we can then remove the check for noSignedZeros() below (see D98963). 
- if (FMF.noSignedZeros()) - return ConstantFP::get(Tp, 0.0L); - return ConstantFP::get(Tp, -0.0L); + return ConstantExpr::getBinOpIdentity(getOpcode(K), Tp, false, FMF.noSignedZeros()); + case RecurKind::FMulAdd: + return ConstantExpr::getBinOpIdentity(Instruction::FAdd, Tp, false, FMF.noSignedZeros()); case RecurKind::UMin: return ConstantInt::get(Tp, -1, true); case RecurKind::UMax: From 941feb76c8186d2e237690511b48f57c6bda282b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 30 Aug 2024 17:17:11 +0100 Subject: [PATCH 59/98] [CostModel][X86] Fix SSE41/SSE42 cost checks on icmp tests Noticed on #106747 - some SSE41 tests didn't match the SSE2 baseline so we were missing ALL the checks :( --- .../Analysis/CostModel/X86/icmp-codesize.ll | 380 +++++++++++++++- .../Analysis/CostModel/X86/icmp-latency.ll | 416 ++++++++++++++++-- .../CostModel/X86/icmp-sizelatency.ll | 378 +++++++++++++++- llvm/test/Analysis/CostModel/X86/icmp.ll | 140 ++---- 4 files changed, 1165 insertions(+), 149 deletions(-) diff --git a/llvm/test/Analysis/CostModel/X86/icmp-codesize.ll b/llvm/test/Analysis/CostModel/X86/icmp-codesize.ll index 2dc6737a3d8a07..6c1bfb72c85967 100644 --- a/llvm/test/Analysis/CostModel/X86/icmp-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/icmp-codesize.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse3 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2 -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE2 -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx | FileCheck %s --check-prefixes=AVX1 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F @@ -11,8 +11,8 @@ ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOPAVX1 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2 ; -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=slm | FileCheck %s --check-prefixes=SSE42 -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=goldmont | 
FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=slm | FileCheck %s --check-prefixes=SSE4,SSE42 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=goldmont | FileCheck %s --check-prefixes=SSE4,SSE42 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { @@ -39,6 +39,29 @@ define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x ; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_eq' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 
40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_eq' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 @@ -251,6 +274,29 @@ define i32 @cmp_int_ne(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x ; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_ne' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_ne' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 @@ -463,6 +509,29 @@ define i32 @cmp_int_sge(i8 %arg8, <16 x i8> 
%argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_sge' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_sge' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 @@ -652,6 +721,52 @@ define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 } define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> 
%argv16i64) { +; SSE2-LABEL: 'cmp_int_uge' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; +; SSE41-LABEL: 'cmp_int_uge' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_uge' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 @@ -864,6 +979,29 @@ define i32 @cmp_int_sgt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_sgt' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; 
SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_sgt' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 @@ -1076,6 +1214,29 @@ define i32 @cmp_int_ugt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_ugt' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 
= icmp ugt i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_ugt' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 @@ -1288,6 +1449,29 @@ define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_sle' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 
48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_sle' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 @@ -1477,6 +1661,52 @@ define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 } define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { +; SSE2-LABEL: 'cmp_int_ule' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; +; SSE41-LABEL: 'cmp_int_ule' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %I8 = icmp ule i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_ule' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 @@ -1689,6 +1919,29 @@ define i32 @cmp_int_slt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_slt' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_slt' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 @@ -1901,6 +2154,29 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_ult' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; 
SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_ult' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 @@ -2090,6 +2366,52 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 } define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { +; SSE2-LABEL: 'scmp_int' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> 
@llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; +; SSE41-LABEL: 'scmp_int' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8) +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8) +; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8) +; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8) +; SSE41-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8) +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16) +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16) +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16) +; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16) +; SSE41-NEXT: Cost 
Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16) +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32) +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32) +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32) +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32) +; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32) +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64) +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64) +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64) +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64) +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64) +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; AVX1-LABEL: 'scmp_int' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8) ; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8) @@ -2256,6 +2578,52 @@ define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i } define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { +; SSE2-LABEL: 'ucmp_int' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = 
call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; +; SSE41-LABEL: 'ucmp_int' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8) +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8) +; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8) +; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8) +; SSE41-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8) +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16) +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16) +; SSE41-NEXT: Cost Model: Found an estimated cost 
of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16) +; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16) +; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16) +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32) +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32) +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32) +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32) +; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32) +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64) +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64) +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64) +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64) +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64) +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; AVX1-LABEL: 'ucmp_int' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8) ; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8) @@ -2421,3 +2789,5 @@ define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i ret i32 undef } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; SSE4: {{.*}} diff --git a/llvm/test/Analysis/CostModel/X86/icmp-latency.ll b/llvm/test/Analysis/CostModel/X86/icmp-latency.ll index 726a6dd782a4f7..efa903ea2819c8 100644 --- a/llvm/test/Analysis/CostModel/X86/icmp-latency.ll +++ b/llvm/test/Analysis/CostModel/X86/icmp-latency.ll @@ -2,8 +2,8 @@ ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+sse3 | FileCheck %s --check-prefixes=SSE2 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2 -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE2 -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+avx | FileCheck %s --check-prefixes=AVX1 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F @@ -12,7 +12,7 @@ ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2 ; ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=slm | FileCheck %s --check-prefixes=SLM -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE4,SSE42 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { @@ -39,6 +39,29 @@ define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x ; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_eq' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_eq' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 @@ -274,6 +297,29 @@ define i32 @cmp_int_ne(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x ; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_ne' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_ne' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 @@ -509,6 +555,29 @@ define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_sge' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_sge' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 @@ -721,6 +790,52 @@ define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 } define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { +; SSE2-LABEL: 'cmp_int_uge' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 
8 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; +; SSE41-LABEL: 'cmp_int_uge' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; 
SSE42-LABEL: 'cmp_int_uge' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 @@ -956,6 +1071,29 @@ define i32 @cmp_int_sgt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_sgt' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_sgt' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 @@ -1191,6 +1329,29 @@ define i32 @cmp_int_ugt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ugt <16 x i64> 
%argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_ugt' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_ugt' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 @@ -1426,6 +1587,29 @@ define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_sle' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = 
icmp sle <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_sle' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 @@ -1638,6 +1822,52 @@ define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 } define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { +; SSE2-LABEL: 'cmp_int_ule' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = 
icmp ule <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; +; SSE41-LABEL: 'cmp_int_ule' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: 
Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_ule' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 @@ -1873,6 +2103,29 @@ define i32 @cmp_int_slt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_slt' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_slt' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 @@ -2108,6 +2361,29 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_ult' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret 
i32 undef +; ; SSE42-LABEL: 'cmp_int_ult' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 @@ -2320,28 +2596,51 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 } define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { -; SSE42-LABEL: 'scmp_int' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64) -; SSE42-NEXT: Cost Model: Found an estimated 
cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; SSE2-LABEL: 'scmp_int' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> 
%argv8i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; +; SSE4-LABEL: 'scmp_int' +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; 
AVX1-LABEL: 'scmp_int' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8) @@ -2532,28 +2831,51 @@ define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i } define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { -; SSE42-LABEL: 'ucmp_int' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64) -; 
SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; SSE2-LABEL: 'ucmp_int' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> 
@llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; +; SSE4-LABEL: 'ucmp_int' +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'ucmp_int' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 
@llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8) diff --git a/llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll index 0deaad5991fb2f..4fc7c68be26f78 100644 --- a/llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll @@ -2,8 +2,8 @@ ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse3 | FileCheck %s --check-prefixes=SSE2 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2 -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE2 -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx | FileCheck %s --check-prefixes=AVX1 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F @@ -11,8 +11,8 @@ ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOPAVX1 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2 ; -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=slm | FileCheck %s --check-prefixes=SSE42 -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=slm | FileCheck %s --check-prefixes=SSE4,SSE42 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE4,SSE42 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { @@ -39,6 +39,29 @@ define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x ; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_eq'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_eq'
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8
@@ -251,6 +274,29 @@ define i32 @cmp_int_ne(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_ne'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_ne'
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8
@@ -463,6 +509,29 @@ define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_sge'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_sge'
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8
@@ -652,6 +721,52 @@ define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x
 }
 
 define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'cmp_int_uge'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'cmp_int_uge'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_uge'
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8
@@ -864,6 +979,29 @@ define i32 @cmp_int_sgt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_sgt'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_sgt'
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8
@@ -1076,6 +1214,29 @@ define i32 @cmp_int_ugt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_ugt'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_ugt'
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8
@@ -1288,6 +1449,29 @@ define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_sle'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_sle'
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8
@@ -1477,6 +1661,52 @@ define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x
 }
 
 define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'cmp_int_ule'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'cmp_int_ule'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_ule'
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8
@@ -1689,6 +1919,29 @@ define i32 @cmp_int_slt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_slt'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_slt'
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8
@@ -1901,6 +2154,29 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_ult'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_ult'
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8
@@ -2090,6 +2366,52 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x
 }
 
 define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'scmp_int'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'scmp_int'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; AVX1-LABEL: 'scmp_int'
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
@@ -2256,6 +2578,52 @@ define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
 }
 
 define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'ucmp_int'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'ucmp_int'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; AVX1-LABEL: 'ucmp_int'
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
@@ -2421,3 +2789,5 @@ define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
 ret i32 undef
 }
 
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SSE4: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/X86/icmp.ll b/llvm/test/Analysis/CostModel/X86/icmp.ll
index 599895c2b5705a..d8959a67145d63 100644
--- a/llvm/test/Analysis/CostModel/X86/icmp.ll
+++ b/llvm/test/Analysis/CostModel/X86/icmp.ll
@@ -2,8 +2,8 @@
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse3 | FileCheck %s --check-prefixes=SSE3
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
@@ -12,7 +12,7 @@
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+xop,+avx2 | FileCheck %s -check-prefixes=XOPAVX2
 ;
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=SLM
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=SSE4,SSE42
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1
 
 define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
@@ -3125,51 +3125,28 @@ define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; SSE41-LABEL: 'scmp_int'
-; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; SSE42-LABEL: 'scmp_int'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-LABEL: 'scmp_int'
+; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'scmp_int'
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
@@ -3429,51 +3406,28 @@ define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; SSE41-LABEL: 'ucmp_int'
-; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; SSE42-LABEL: 'ucmp_int'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64>
@llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE4-LABEL: 'ucmp_int' +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'ucmp_int' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 

From 18e55052d6c7da765bbec311b1b6ac9590a2bfa3 Mon Sep 17 00:00:00 2001
From: Connie Zhu <60797237+connieyzhu@users.noreply.github.com>
Date: Fri, 30 Aug 2024 09:21:13 -0700
Subject: [PATCH 60/98] [mlir][polly][llvm-lit] Fixed logic for turning on
 external shell in lit (#106458)

For both mlir and polly, the lit internal shell is the default shell for
running lit tests. However, if the user wants to switch back to the
external shell by setting `LIT_USE_INTERNAL_SHELL=0`, the `not` used in
the body of the `if` conditional changes `use_lit_shell` to True instead
of the intended False. Removing `not` allows this lit config to work as
intended.

Fixes https://github.com/llvm/llvm-project/issues/106459.
---
 mlir/test/lit.cfg.py       | 5 ++++-
 polly/test/UnitIsl/lit.cfg | 5 ++++-
 polly/test/lit.cfg         | 5 ++++-
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index 98d0ddd9a2be11..f0d4f35ba3e229 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -18,12 +18,15 @@
 # name: The name of this test suite.
 config.name = "MLIR"
 
+# TODO: Consolidate the logic for turning on the internal shell by default for all LLVM test suites.
+# See https://github.com/llvm/llvm-project/issues/106636 for more details.
+#
 # We prefer the lit internal shell which provides a better user experience on failures
 # unless the user explicitly disables it with LIT_USE_INTERNAL_SHELL=0 env var.
 use_lit_shell = True
 lit_shell_env = os.environ.get("LIT_USE_INTERNAL_SHELL")
 if lit_shell_env:
-    use_lit_shell = not lit.util.pythonize_bool(lit_shell_env)
+    use_lit_shell = lit.util.pythonize_bool(lit_shell_env)
 
 config.test_format = lit.formats.ShTest(execute_external=not use_lit_shell)
 
diff --git a/polly/test/UnitIsl/lit.cfg b/polly/test/UnitIsl/lit.cfg
index 0944d543572d86..4b68f1460c3d83 100644
--- a/polly/test/UnitIsl/lit.cfg
+++ b/polly/test/UnitIsl/lit.cfg
@@ -17,12 +17,15 @@ config.name = 'Polly - isl unit tests'
 # For now we require '&&' between commands, until they get globally killed and
 # the test runner updated.
 #
+# TODO: Consolidate the logic for turning on the internal shell by default for all LLVM test suites.
+# See https://github.com/llvm/llvm-project/issues/106636 for more details.
+#
 # We prefer the lit internal shell which provides a better user experience on failures
 # unless the user explicitly disables it with LIT_USE_INTERNAL_SHELL=0 env var.
 use_lit_shell = True
 lit_shell_env = os.environ.get("LIT_USE_INTERNAL_SHELL")
 if lit_shell_env:
-    use_lit_shell = not lit.util.pythonize_bool(lit_shell_env)
+    use_lit_shell = lit.util.pythonize_bool(lit_shell_env)
 
 config.test_format = lit.formats.ShTest(execute_external=not use_lit_shell)
 
diff --git a/polly/test/lit.cfg b/polly/test/lit.cfg
index 156c1f97f5d3ae..075ebdacbdc946 100644
--- a/polly/test/lit.cfg
+++ b/polly/test/lit.cfg
@@ -20,12 +20,15 @@ config.name = 'Polly'
 # For now we require '&&' between commands, until they get globally killed and
 # the test runner updated.
 #
+# TODO: Consolidate the logic for turning on the internal shell by default for all LLVM test suites.
+# See https://github.com/llvm/llvm-project/issues/106636 for more details.
+#
 # We prefer the lit internal shell which provides a better user experience on failures
 # unless the user explicitly disables it with LIT_USE_INTERNAL_SHELL=0 env var.
 use_lit_shell = True
 lit_shell_env = os.environ.get("LIT_USE_INTERNAL_SHELL")
 if lit_shell_env:
-    use_lit_shell = not lit.util.pythonize_bool(lit_shell_env)
+    use_lit_shell = lit.util.pythonize_bool(lit_shell_env)
 
 config.test_format = lit.formats.ShTest(execute_external=not use_lit_shell)
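
The boolean inversion this patch removes is easy to see in isolation. A minimal Python sketch follows; the `pythonize_bool` below is a simplified, assumed stand-in for `lit.util.pythonize_bool`, and the values are hypothetical:

def pythonize_bool(value):
    # Simplified stand-in for lit.util.pythonize_bool: map common
    # true/false spellings of an environment-variable string to a bool.
    return value.strip().lower() in ("1", "true", "on", "yes")

# The user requests the external shell:
lit_shell_env = "0"  # i.e. LIT_USE_INTERNAL_SHELL=0

# Before the fix: the extra `not` inverts the request, so "0" yields
# True and lit keeps using its internal shell.
use_lit_shell = not pythonize_bool(lit_shell_env)  # True (wrong)

# After the fix: the parsed value is taken as-is, so "0" yields False,
# and execute_external=not use_lit_shell becomes True, as intended.
use_lit_shell = pythonize_bool(lit_shell_env)      # False (intended)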

From f1cf09104eddbbe81c75e112a85c4f8dc14d5035 Mon Sep 17 00:00:00 2001
From: Harini0924
Date: Fri, 30 Aug 2024 09:33:02 -0700
Subject: [PATCH 61/98] [compiler-rt][test] Added `env` command to fix
 NSAN_OPTIONS command not found error (#106676)

Resolved the issue where `'NSAN_OPTIONS=check_nan=true,halt_on_error=0'`
was not recognized as a command. Changed the test command to set the
environment variable correctly using `env`.

fixes: #106598
---
 compiler-rt/test/nsan/vec_sqrt.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/compiler-rt/test/nsan/vec_sqrt.cpp b/compiler-rt/test/nsan/vec_sqrt.cpp
index d1ef0487858506..64a7130322873c 100644
--- a/compiler-rt/test/nsan/vec_sqrt.cpp
+++ b/compiler-rt/test/nsan/vec_sqrt.cpp
@@ -1,7 +1,7 @@
 // RUN: %clangxx_nsan -O0 -g -mavx %s -o %t
-// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s
+// RUN: env NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s
 // RUN: %clangxx_nsan -O3 -g -mavx %s -o %t
-// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s
+// RUN: env NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s
 #include
 #include
@@ -31,4 +31,4 @@ int main() {
     // CHECK: WARNING: NumericalStabilitySanitizer: NaN detected
   }
   return 0;
-}
\ No newline at end of file
+}
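
One detail worth spelling out from the nsan fix above: lit's internal shell does not implement the POSIX `VAR=value command` shorthand, so the bare assignment is parsed as a command name, while the `env` prefix makes the intent explicit. A rough Python picture of what the `env` form does; `./vec_sqrt` is a hypothetical stand-in for the compiled binary lit substitutes for `%run %t`:

import os
import subprocess

# `env NSAN_OPTIONS=... ./vec_sqrt` runs the binary with one extra
# variable in its environment:
child_env = dict(os.environ)
child_env["NSAN_OPTIONS"] = "check_nan=true,halt_on_error=0"
subprocess.run(["./vec_sqrt"], env=child_env)

# Without `env`, lit's internal shell tries to execute the literal
# string "NSAN_OPTIONS=check_nan=true,halt_on_error=0" as a program,
# which produces the "command not found" error fixed above.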

From f81f283b365f99e1a71a836381c36874e965d80e Mon Sep 17 00:00:00 2001
From: Florian Mayer
Date: Fri, 30 Aug 2024 09:38:44 -0700
Subject: [PATCH 62/98] Revert "Reapply "[HWASan] remove incorrectly inferred
 attributes" (#106622)" (#106758)

Reverts llvm/llvm-project#106624, which caused timeouts.
---
 .../CodeGen/address-safety-attr-flavors.cpp   |  28 +-
 .../Instrumentation/HWAddressSanitizer.cpp    |  26 +-
 .../HWAddressSanitizer/RISCV/alloca.ll        | 156 +++++-----
 .../HWAddressSanitizer/RISCV/basic.ll         | 270 +++++++++---------
 .../HWAddressSanitizer/alloca.ll              | 160 +++++------
 .../HWAddressSanitizer/attrinfer.ll           |  14 -
 .../HWAddressSanitizer/basic.ll               | 208 +++++++-------
 .../HWAddressSanitizer/fixed-shadow.ll        |   4 +-
 .../hwasan-pass-second-run.ll                 |   4 +-
 .../HWAddressSanitizer/mem-attr.ll            |   2 +-
 10 files changed, 421 insertions(+), 451 deletions(-)
 delete mode 100644 llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll

diff --git a/clang/test/CodeGen/address-safety-attr-flavors.cpp b/clang/test/CodeGen/address-safety-attr-flavors.cpp
index 04d540d471dc8f..ef815555059db8 100644
--- a/clang/test/CodeGen/address-safety-attr-flavors.cpp
+++ b/clang/test/CodeGen/address-safety-attr-flavors.cpp
@@ -28,8 +28,8 @@ int HasSanitizeAddress() { return 1; }
 // CHECK-NOASAN: {{Function Attrs: mustprogress noinline nounwind$}}
 // CHECK-ASAN: Function Attrs: mustprogress noinline nounwind sanitize_address
 // CHECK-KASAN: Function Attrs: mustprogress noinline nounwind sanitize_address
-// CHECK-HWASAN: Function Attrs: mustprogress nobuiltin noinline nounwind sanitize_hwaddress
-// CHECK-KHWASAN: Function Attrs: mustprogress nobuiltin noinline nounwind sanitize_hwaddress
+// CHECK-HWASAN: Function Attrs: mustprogress noinline nounwind sanitize_hwaddress
+// CHECK-KHWASAN: Function Attrs: mustprogress noinline nounwind sanitize_hwaddress
 
 __attribute__((no_sanitize("address"))) int NoSanitizeQuoteAddress() {
   return 0;
@@ -37,15 +37,15 @@ __attribute__((no_sanitize("address"))) int NoSanitizeQuoteAddress() {
 // CHECK-NOASAN: {{Function Attrs: mustprogress noinline nounwind$}}
 // CHECK-ASAN: {{Function Attrs: mustprogress noinline nounwind$}}
 // CHECK-KASAN: {{Function Attrs: mustprogress noinline nounwind$}}
-// CHECK-HWASAN: {{Function Attrs: mustprogress nobuiltin noinline nounwind sanitize_hwaddress$}}
-// CHECK-KHWASAN: {{Function Attrs: mustprogress nobuiltin noinline nounwind sanitize_hwaddress$}}
+// CHECK-HWASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_hwaddress$}}
+// CHECK-KHWASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_hwaddress$}}
 
 __attribute__((no_sanitize_address)) int NoSanitizeAddress() { return 0; }
 
 // CHECK-NOASAN: {{Function Attrs: mustprogress noinline nounwind$}}
 // CHECK-ASAN: {{Function Attrs: mustprogress noinline nounwind$}}
 // CHECK-KASAN: {{Function Attrs: mustprogress noinline nounwind$}}
-// CHECK-HWASAN: {{Function Attrs: mustprogress nobuiltin noinline nounwind sanitize_hwaddress$}}
-// CHECK-KHWASAN: {{Function Attrs: mustprogress nobuiltin noinline nounwind sanitize_hwaddress$}}
+// CHECK-HWASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_hwaddress$}}
+// CHECK-KHWASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_hwaddress$}}
 
 __attribute__((no_sanitize("kernel-address"))) int NoSanitizeKernelAddress() {
   return 0;
@@ -53,8 +53,8 @@ __attribute__((no_sanitize("kernel-address"))) int NoSanitizeKernelAddress() {
 // CHECK-NOASAN: {{Function Attrs: mustprogress noinline nounwind$}}
 // CHECK-ASAN: {{Function Attrs: mustprogress noinline nounwind$}}
 // CHECK-KASAN: {{Function Attrs: mustprogress noinline nounwind$}}
-// CHECK-HWASAN: {{Function Attrs: mustprogress nobuiltin noinline nounwind sanitize_hwaddress$}}
-// CHECK-KHWASAN: {{Function Attrs: mustprogress nobuiltin noinline nounwind sanitize_hwaddress$}}
+// CHECK-HWASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_hwaddress$}}
+// CHECK-KHWASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_hwaddress$}}
 
 __attribute__((no_sanitize("hwaddress"))) int NoSanitizeHWAddress() {
   return 0;
@@ -62,8 +62,8 @@ __attribute__((no_sanitize("hwaddress"))) int NoSanitizeHWAddress() {
 // CHECK-NOASAN: {{Function Attrs: mustprogress noinline nounwind$}}
 // CHECK-ASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_address$}}
 // CHECK-KASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_address$}}
-// CHECK-HWASAN: {{Function Attrs: mustprogress nobuiltin noinline nounwind$}}
-// CHECK-KHWASAN: {{Function Attrs: mustprogress nobuiltin noinline nounwind$}}
+// CHECK-HWASAN: {{Function Attrs: mustprogress noinline nounwind$}}
+// CHECK-KHWASAN: {{Function Attrs: mustprogress noinline nounwind$}}
 
 __attribute__((no_sanitize("kernel-hwaddress"))) int NoSanitizeKernelHWAddress() {
   return 0;
@@ -71,8 +71,8 @@ __attribute__((no_sanitize("kernel-hwaddress"))) int NoSanitizeKernelHWAddress()
 // CHECK-NOASAN: {{Function Attrs: mustprogress noinline nounwind$}}
 // CHECK-ASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_address$}}
 // CHECK-KASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_address$}}
-// CHECK-HWASAN: {{Function Attrs: mustprogress nobuiltin noinline nounwind$}}
-// CHECK-KHWASAN: {{Function Attrs: mustprogress nobuiltin noinline nounwind$}}
+// CHECK-HWASAN: {{Function Attrs: mustprogress noinline nounwind$}}
+// CHECK-KHWASAN: {{Function Attrs: mustprogress noinline nounwind$}}
 
 __attribute__((disable_sanitizer_instrumentation)) int DisableSanitizerInstrumentation() {
   return 0;
@@ -80,5 +80,5 @@ __attribute__((disable_sanitizer_instrumentation)) int DisableSanitizerInstrumen
 // CHECK-NOASAN: {{Function Attrs: disable_sanitizer_instrumentation mustprogress noinline nounwind$}}
 // CHECK-ASAN: {{Function Attrs: disable_sanitizer_instrumentation mustprogress noinline nounwind$}}
 // CHECK-KASAN: {{Function Attrs: disable_sanitizer_instrumentation mustprogress noinline nounwind$}}
-// CHECK-HWASAN: {{Function Attrs: disable_sanitizer_instrumentation mustprogress nobuiltin noinline nounwind$}}
-// CHECK-KHWASAN: {{Function Attrs: disable_sanitizer_instrumentation mustprogress nobuiltin noinline nounwind$}}
+// CHECK-HWASAN: {{Function Attrs: disable_sanitizer_instrumentation mustprogress noinline nounwind$}}
+// CHECK-KHWASAN: {{Function Attrs: disable_sanitizer_instrumentation mustprogress noinline nounwind$}}
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 36f5cd7fd9e6cb..69e5835bee8a5e 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -598,24 +598,6 @@ void HWAddressSanitizer::initializeModule() {
   LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n");
   TargetTriple = Triple(M.getTargetTriple());
 
-  for (auto &F : M.functions()) {
-    // Remove memory attributes that are invalid with HWASan.
-    // HWASan checks read from shadow, which invalidates memory(argmem: *)
-    // Short granule checks on function arguments read from the argument memory
-    // (last byte of the granule), which invalidates writeonly.
-    //
-    // This is not only true for sanitized functions, because AttrInfer can
-    // infer those attributes on libc functions, which is not true if those
-    // are instrumented (Android) or intercepted.
-
-    // nobuiltin makes sure later passes don't restore assumptions about
-    // the function.
-    F.addFnAttr(llvm::Attribute::NoBuiltin);
-    F.removeFnAttr(llvm::Attribute::Memory);
-    for (auto &A : F.args())
-      A.removeAttr(llvm::Attribute::WriteOnly);
-  }
-
   // x86_64 currently has two modes:
   // - Intel LAM (default)
   // - pointer aliasing (heap only)
@@ -1640,6 +1622,14 @@ void HWAddressSanitizer::sanitizeFunction(Function &F,
 
   assert(!ShadowBase);
 
+  // Remove memory attributes that are about to become invalid.
+  // HWASan checks read from shadow, which invalidates memory(argmem: *)
+  // Short granule checks on function arguments read from the argument memory
+  // (last byte of the granule), which invalidates writeonly.
+  F.removeFnAttr(llvm::Attribute::Memory);
+  for (auto &A : F.args())
+    A.removeAttr(llvm::Attribute::WriteOnly);
+
   BasicBlock::iterator InsertPt = F.getEntryBlock().begin();
   IRBuilder<> EntryIRB(&F.getEntryBlock(), InsertPt);
   emitPrologue(EntryIRB,
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll
index 032168e28421b9..23b1043c700165 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll
@@ -33,7 +33,7 @@ declare void @use32(ptr)
 ;.
 define void @test_alloca() sanitize_hwaddress !dbg !15 {
 ; DYNAMIC-SHADOW-LABEL: define void @test_alloca
-; DYNAMIC-SHADOW-SAME: () #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] {
+; DYNAMIC-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] {
 ; DYNAMIC-SHADOW-NEXT: entry:
 ; DYNAMIC-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow)
 ; DYNAMIC-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -42,33 +42,33 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 {
 ; DYNAMIC-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]]
 ; DYNAMIC-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56
 ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16
-; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]])
-; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]]
-; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]]
-; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]]
-; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]]
-; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]]
-; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]]
-; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]]
-; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]]
-; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]]
-; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]]
-; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG14]]
-; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]]
-; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]]
-; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]]
-; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]]
-; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]]
-; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]]
-; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]]
-; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]]
-; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]]
-; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG15]]
-; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]]
-; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG15]]
+; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]])
+; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]]
+; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]]
+; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]]
+; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]]
+; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]]
+; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG14]]
+; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]]
+; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG14]]
 ;
 ; ZERO-BASED-SHADOW-LABEL: define void @test_alloca
-; ZERO-BASED-SHADOW-SAME: () #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] {
+; ZERO-BASED-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] {
 ; ZERO-BASED-SHADOW-NEXT: entry:
 ; ZERO-BASED-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr null)
 ; ZERO-BASED-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -77,30 +77,30 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 {
 ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]]
 ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56
 ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16
-; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]])
-; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]]
-; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]]
-; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]]
-; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]]
-; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]]
-; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]]
-; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]]
-; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]]
-; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]]
-; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]]
-; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG14]]
-; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]]
-; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]]
-; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]]
-; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]]
-; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]]
-; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]]
-; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]]
-; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]]
-; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]]
-; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG15]]
-; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]]
-; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG15]]
+; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]])
+; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]]
+; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]]
+; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]]
+; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]]
+; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]]
+; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG14]]
+; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]]
+; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG14]]
 ;
 entry:
 %x = alloca i32, align 4
@@ -131,17 +131,15 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
 !23 = !DILocation(line: 7, column: 5, scope: !15)
 !24 = !DILocation(line: 8, column: 1, scope: !15)
 ;.
-; DYNAMIC-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin }
-; DYNAMIC-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress }
-; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind }
-; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
-; DYNAMIC-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
+; DYNAMIC-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress }
+; DYNAMIC-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind }
+; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
 ;.
-; ZERO-BASED-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin }
-; ZERO-BASED-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress }
-; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind }
-; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
-; ZERO-BASED-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
+; ZERO-BASED-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress }
+; ZERO-BASED-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind }
+; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
 ;.
 ; DYNAMIC-SHADOW: [[META0]] = !{ptr @hwasan.note}
 ; DYNAMIC-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None)
@@ -149,16 +147,15 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
 ; DYNAMIC-SHADOW: [[META3]] = !{}
 ; DYNAMIC-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4}
 ; DYNAMIC-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
-; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1}
-; DYNAMIC-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
-; DYNAMIC-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]])
-; DYNAMIC-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]])
-; DYNAMIC-SHADOW: [[META10]] = !{null}
-; DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]])
-; DYNAMIC-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-; DYNAMIC-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]])
-; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]])
-; DYNAMIC-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]])
+; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; DYNAMIC-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]])
+; DYNAMIC-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]])
+; DYNAMIC-SHADOW: [[META9]] = !{null}
+; DYNAMIC-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]])
+; DYNAMIC-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+; DYNAMIC-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]])
+; DYNAMIC-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]])
+; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]])
 ;.
 ; ZERO-BASED-SHADOW: [[META0]] = !{ptr @hwasan.note}
 ; ZERO-BASED-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None)
@@ -166,14 +163,13 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
 ; ZERO-BASED-SHADOW: [[META3]] = !{}
 ; ZERO-BASED-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4}
 ; ZERO-BASED-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
-; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1}
-; ZERO-BASED-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
-; ZERO-BASED-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]])
-; ZERO-BASED-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]])
-; ZERO-BASED-SHADOW: [[META10]] = !{null}
-; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]])
-; ZERO-BASED-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-; ZERO-BASED-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]])
-; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]])
-; ZERO-BASED-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]])
+; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; ZERO-BASED-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]])
+; ZERO-BASED-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]])
+; ZERO-BASED-SHADOW: [[META9]] = !{null}
+; ZERO-BASED-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]])
+; ZERO-BASED-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+; ZERO-BASED-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]])
+; ZERO-BASED-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]])
+; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]])
 ;.
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll
index dc2d11cb4b3538..9cebe2e845f772 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll
@@ -9,6 +9,8 @@
 ; RUN: opt < %s -passes=hwasan -hwasan-recover=0 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=ABORT-ZERO-BASED-SHADOW
 ; RUN: opt < %s -passes=hwasan -hwasan-recover=1 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=RECOVER-ZERO-BASED-SHADOW
 
+; CHECK: @llvm.used = appending global [1 x ptr] [ptr @hwasan.module_ctor]
+; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @hwasan.module_ctor, ptr @hwasan.module_ctor }]
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "riscv64-unknown-linux"
 
@@ -30,7 +32,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress {
 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]]
 ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1:![0-9]+]]
 ; CHECK: 12:
 ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 0)
 ; CHECK-NEXT: br label [[TMP13]]
@@ -66,7 +68,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress {
 ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]]
 ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]]
-; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2:![0-9]+]]
+; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1:![0-9]+]]
 ; FASTPATH: 12:
 ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 0)
 ; FASTPATH-NEXT: br label [[TMP13]]
@@ -86,7 +88,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress {
 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]]
 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1
 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]]
-; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]]
+; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]]
 ; ABORT-DYNAMIC-SHADOW: 8:
 ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0)
 ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]]
@@ -106,10 +108,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress {
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]]
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]]
-; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]]
+; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]]
 ; RECOVER-DYNAMIC-SHADOW: 8:
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15
-; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]]
+; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
 ; RECOVER-DYNAMIC-SHADOW: 10:
 ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 96", "{x10}"(i64 [[TMP0]])
 ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]]
@@ -118,13 +120,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress {
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]]
-; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]]
+; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]]
 ; RECOVER-DYNAMIC-SHADOW: 16:
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]]
-; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]]
+; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]]
 ; RECOVER-DYNAMIC-SHADOW: 21:
 ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]]
 ; RECOVER-DYNAMIC-SHADOW: 22:
@@ -143,7 +145,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress {
 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1
 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]]
-; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]]
+; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]]
 ; ABORT-ZERO-BASED-SHADOW: 8:
 ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0)
 ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]]
@@ -163,10 +165,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress {
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]]
-; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]]
+; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]]
 ; RECOVER-ZERO-BASED-SHADOW: 8:
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15
-; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]]
+; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
 ; RECOVER-ZERO-BASED-SHADOW: 10:
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 96", "{x10}"(i64 [[TMP0]])
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]]
@@ -175,13 +177,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress {
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]]
-; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]]
+; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]]
 ; RECOVER-ZERO-BASED-SHADOW: 16:
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]]
-; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]]
+; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]]
 ; RECOVER-ZERO-BASED-SHADOW: 21:
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]]
 ; RECOVER-ZERO-BASED-SHADOW: 22:
@@ -210,7 +212,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress {
 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]]
 ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
 ; CHECK: 12:
 ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 1)
 ; CHECK-NEXT: br label [[TMP13]]
@@ -246,7 +248,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress {
 ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]]
 ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]]
-; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]]
+; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
 ; FASTPATH: 12:
 ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 1)
 ; FASTPATH-NEXT: br label [[TMP13]]
@@ -266,7 +268,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress {
 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]]
 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1
 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]]
-; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]]
+; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
 ; ABORT-DYNAMIC-SHADOW: 8:
 ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1)
 ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]]
@@ -286,10 +288,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress {
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]]
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]]
-; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]]
+; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]]
 ; RECOVER-DYNAMIC-SHADOW: 8:
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15
-; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]]
+; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
 ; RECOVER-DYNAMIC-SHADOW: 10:
 ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 97", "{x10}"(i64 [[TMP0]])
 ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]]
@@ -298,13 +300,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress {
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]]
-; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]]
+; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]]
 ; RECOVER-DYNAMIC-SHADOW: 16:
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]]
-; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]]
+; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]]
 ; RECOVER-DYNAMIC-SHADOW: 21:
 ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]]
 ; RECOVER-DYNAMIC-SHADOW: 22:
@@ -323,7 +325,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress {
 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1
 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]]
-; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]]
+; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
 ; ABORT-ZERO-BASED-SHADOW: 8:
 ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1)
 ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]]
@@ -343,10 +345,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress {
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]]
-; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]]
+; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]]
 ; RECOVER-ZERO-BASED-SHADOW: 8:
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15
-; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]]
+; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
 ; RECOVER-ZERO-BASED-SHADOW: 10:
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 97", "{x10}"(i64 [[TMP0]])
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]]
@@ -355,13 +357,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress {
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]]
-; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]]
+; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]]
 ; RECOVER-ZERO-BASED-SHADOW: 16:
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]]
-; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]]
+; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]]
 ; RECOVER-ZERO-BASED-SHADOW: 21:
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]]
 ; RECOVER-ZERO-BASED-SHADOW: 22:
@@ -390,7 +392,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress {
 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]]
 ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
 ; CHECK: 12:
 ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 2)
 ; CHECK-NEXT: br label [[TMP13]]
@@ -426,7 +428,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress {
 ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]]
 ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]]
-; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]]
+; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
 ; FASTPATH: 12:
 ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 2)
 ; FASTPATH-NEXT: br label [[TMP13]]
@@ -446,7 +448,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress {
 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]]
 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1
 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]]
-; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]]
+; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
 ; ABORT-DYNAMIC-SHADOW: 8:
 ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2)
 ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]]
@@ -466,10 +468,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress {
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]]
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]]
-; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]]
+; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]]
 ; RECOVER-DYNAMIC-SHADOW: 8:
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15
-; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]]
+; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
 ; RECOVER-DYNAMIC-SHADOW: 10:
 ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 98", "{x10}"(i64 [[TMP0]])
 ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]]
@@ -478,13 +480,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress {
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]]
-; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]]
+; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]]
 ; RECOVER-DYNAMIC-SHADOW: 16:
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]]
-; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]]
+; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]]
 ; RECOVER-DYNAMIC-SHADOW: 21:
 ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]]
 ; RECOVER-DYNAMIC-SHADOW: 22:
@@ -503,7 +505,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress {
 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1
 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]]
-; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]]
+; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
 ; ABORT-ZERO-BASED-SHADOW: 8:
 ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2)
 ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]]
@@ -523,10 +525,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress {
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]]
-; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]]
+; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]]
 ; RECOVER-ZERO-BASED-SHADOW: 8:
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15
-; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]]
+; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
 ; RECOVER-ZERO-BASED-SHADOW: 10:
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 98", "{x10}"(i64 [[TMP0]])
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]]
@@ -535,13 +537,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress {
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]]
-; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]]
+; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]]
 ; RECOVER-ZERO-BASED-SHADOW: 16:
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]]
-; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]]
+; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]]
 ; RECOVER-ZERO-BASED-SHADOW: 21:
 ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]]
 ; RECOVER-ZERO-BASED-SHADOW: 22:
@@ -570,7 +572,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress {
 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]]
 ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
 ; CHECK: 12:
 ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 3)
 ; CHECK-NEXT: br label [[TMP13]]
@@ -606,7 +608,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress {
 ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]]
 ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]]
-; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]]
+; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
 ; FASTPATH: 12:
 ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 3)
 ; FASTPATH-NEXT: br label [[TMP13]]
@@ -626,7 +628,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress {
 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]]
 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1
 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]]
-; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]]
+; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
 ; ABORT-DYNAMIC-SHADOW: 8:
 ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3)
 ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]]
@@ -646,10 +648,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress {
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]]
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]]
-; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]]
+; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]]
 ; RECOVER-DYNAMIC-SHADOW: 8:
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15
-;
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 99", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -658,13 +660,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -683,7 +685,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -703,10 +705,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 99", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -715,13 +717,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -750,7 +752,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 4) ; CHECK-NEXT: br label [[TMP13]] @@ -786,7 +788,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 4) ; FASTPATH-NEXT: br label [[TMP13]] @@ -806,7 +808,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -826,10 +828,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 100", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -838,13 +840,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -863,7 +865,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -883,10 +885,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 100", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -895,13 +897,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1011,7 +1013,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 16) ; CHECK-NEXT: br label [[TMP13]] @@ -1047,7 +1049,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 16) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1067,7 +1069,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1087,10 +1089,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: 
[[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 112", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1099,13 +1101,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1124,7 +1126,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1144,10 +1146,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 112", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1156,13 +1158,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1191,7 +1193,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 17) ; CHECK-NEXT: br label [[TMP13]] @@ -1227,7 +1229,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 17) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1247,7 +1249,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1267,10 +1269,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof 
[[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 113", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1279,13 +1281,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1304,7 +1306,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1324,10 +1326,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 113", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1336,13 +1338,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { 
; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1371,7 +1373,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 18) ; CHECK-NEXT: br label [[TMP13]] @@ -1407,7 +1409,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 18) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1427,7 +1429,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1447,10 +1449,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 114", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1459,13 +1461,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1484,7 +1486,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1504,10 +1506,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 114", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label 
[[TMP21:%.*]] @@ -1516,13 +1518,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1551,7 +1553,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 19) ; CHECK-NEXT: br label [[TMP13]] @@ -1587,7 +1589,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 19) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1607,7 +1609,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1627,10 +1629,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 115", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1639,13 +1641,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1664,7 +1666,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1684,10 +1686,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect 
"ebreak\0Aaddiw x0, x11, 115", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1696,13 +1698,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1731,7 +1733,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 20) ; CHECK-NEXT: br label [[TMP13]] @@ -1767,7 +1769,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 20) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1787,7 +1789,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1807,10 +1809,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], 
align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 116", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1819,13 +1821,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1844,7 +1846,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1864,10 +1866,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof 
[[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 116", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1876,13 +1878,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -2058,43 +2060,43 @@ entry: define i8 @test_load_noattr(ptr %a) { ; CHECK-LABEL: define i8 @test_load_noattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_noattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) { ; NOFASTPATH-NEXT: entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_noattr -; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; FASTPATH-SAME: (ptr [[A:%.*]]) { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret 
i8 [[B]] @@ -2106,43 +2108,43 @@ entry: define i8 @test_load_notmyattr(ptr %a) sanitize_address { ; CHECK-LABEL: define i8 @test_load_notmyattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_notmyattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; NOFASTPATH-NEXT: entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_notmyattr -; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll b/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll index 0f74736dc232ea..4bd23ea76c159b 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll @@ -34,7 +34,7 @@ declare void @use32(ptr) ;. 
define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-LABEL: define void @test_alloca( -; DYNAMIC-SHADOW-SAME: ) #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { +; DYNAMIC-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { ; DYNAMIC-SHADOW-NEXT: entry: ; DYNAMIC-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) ; DYNAMIC-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -43,33 +43,33 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; DYNAMIC-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) -; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) +; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: 
[[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG14]] ; ; ZERO-BASED-SHADOW-LABEL: define void @test_alloca( -; ZERO-BASED-SHADOW-SAME: ) #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { +; ZERO-BASED-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { ; ZERO-BASED-SHADOW-NEXT: entry: ; ZERO-BASED-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr null) ; ZERO-BASED-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -78,30 +78,30 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) -; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg 
[[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) +; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, 
i1 false), !dbg [[DBG14]]
+; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG14]]
 ;
 entry:
   %x = alloca i32, align 4
@@ -112,13 +112,13 @@ entry:
 define void @test_vscale_alloca() sanitize_hwaddress {
 ; DYNAMIC-SHADOW-LABEL: define void @test_vscale_alloca(
-; DYNAMIC-SHADOW-SAME: ) #[[ATTR1]] {
+; DYNAMIC-SHADOW-SAME: ) #[[ATTR0]] {
 ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca , align 32
 ; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X]])
 ; DYNAMIC-SHADOW-NEXT: ret void
 ;
 ; ZERO-BASED-SHADOW-LABEL: define void @test_vscale_alloca(
-; ZERO-BASED-SHADOW-SAME: ) #[[ATTR1]] {
+; ZERO-BASED-SHADOW-SAME: ) #[[ATTR0]] {
 ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca , align 32
 ; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X]])
 ; ZERO-BASED-SHADOW-NEXT: ret void
@@ -150,17 +150,15 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
 !23 = !DILocation(line: 7, column: 5, scope: !15)
 !24 = !DILocation(line: 8, column: 1, scope: !15)
 ;.
-; DYNAMIC-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin }
-; DYNAMIC-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress }
-; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind }
-; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
-; DYNAMIC-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
+; DYNAMIC-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress }
+; DYNAMIC-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind }
+; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
 ;.
-; ZERO-BASED-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin }
-; ZERO-BASED-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress }
-; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind }
-; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
-; ZERO-BASED-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
+; ZERO-BASED-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress }
+; ZERO-BASED-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind }
+; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
 ;.
 ; DYNAMIC-SHADOW: [[META0]] = !{ptr @hwasan.note}
 ; DYNAMIC-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None)
@@ -168,16 +166,15 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
 ; DYNAMIC-SHADOW: [[META3]] = !{}
 ; DYNAMIC-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4}
 ; DYNAMIC-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
-; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1}
-; DYNAMIC-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
-; DYNAMIC-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]])
-; DYNAMIC-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]])
-; DYNAMIC-SHADOW: [[META10]] = !{null}
-; DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]])
-; DYNAMIC-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-; DYNAMIC-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]])
-; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]])
-; DYNAMIC-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]])
+; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; DYNAMIC-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]])
+; DYNAMIC-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]])
+; DYNAMIC-SHADOW: [[META9]] = !{null}
+; DYNAMIC-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]])
+; DYNAMIC-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+; DYNAMIC-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]])
+; DYNAMIC-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]])
+; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]])
 ;.
 ; ZERO-BASED-SHADOW: [[META0]] = !{ptr @hwasan.note}
 ; ZERO-BASED-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None)
@@ -185,14 +182,13 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
 ; ZERO-BASED-SHADOW: [[META3]] = !{}
 ; ZERO-BASED-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4}
 ; ZERO-BASED-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
-; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1}
-; ZERO-BASED-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
-; ZERO-BASED-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]])
-; ZERO-BASED-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]])
-; ZERO-BASED-SHADOW: [[META10]] = !{null}
-; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]])
-; ZERO-BASED-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-; ZERO-BASED-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]])
-; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]])
-; ZERO-BASED-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]])
+; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; ZERO-BASED-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]])
+; ZERO-BASED-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]])
+; ZERO-BASED-SHADOW: [[META9]] = !{null}
+; ZERO-BASED-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]])
+; ZERO-BASED-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+; ZERO-BASED-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]])
+; ZERO-BASED-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]])
+; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]])
 ;.
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll b/llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll
deleted file mode 100644
index eeb51aeda1000b..00000000000000
--- a/llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; Standard library functions get inferred attributes, some of which are not
-; correct when building for HWASan.
-
-; RUN: opt < %s -passes=hwasan -S | FileCheck %s --check-prefixes=CHECK
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64--linux-android10000"
-
-declare float @frexpf(float noundef, ptr nocapture noundef) local_unnamed_addr #0
-
-attributes #0 = { mustprogress nofree nounwind willreturn memory(argmem: write) "frame-pointer"="non-leaf" "hwasan-abi"="interceptor" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fix-cortex-a53-835769,+fp-armv8,+neon,+outline-atomics,+tagged-globals,+v8a" }
-
-; CHECK-NOT: memory(argmem: write)
-; CHECK: nobuiltin
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll b/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll
index 1e74f2891a2e3c..4212293f42545e 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll
@@ -42,7 +42,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress {
 ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]]
 ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1
 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]]
-; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]]
+; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]]
 ; FASTPATH: 8:
 ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0)
 ; FASTPATH-NEXT: br label [[TMP9]]
@@ -70,10 +70,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress {
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]]
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]]
-; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]]
+; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]]
 ; RECOVER-DYNAMIC-SHADOW: 8:
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15
-; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]]
+; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
 ; RECOVER-DYNAMIC-SHADOW: 10:
 ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2336", "{x0}"(i64 [[TMP0]])
 ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]]
@@ -82,13 +82,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress {
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]]
-; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]]
+; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]]
 ; RECOVER-DYNAMIC-SHADOW: 16:
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]]
-; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof
[[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -115,10 +115,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2336", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -127,13 +127,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -174,7 +174,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; FASTPATH-NEXT: br label [[TMP9]] @@ -202,10 +202,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 
[[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2337", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -214,13 +214,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -247,10 +247,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2337", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -259,13 +259,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] 
= icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -306,7 +306,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; FASTPATH-NEXT: br label [[TMP9]] @@ -334,10 +334,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2338", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -346,13 +346,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -379,10 +379,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label 
[[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2338", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -391,13 +391,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -438,7 +438,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; FASTPATH-NEXT: br label [[TMP9]] @@ -466,10 +466,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2339", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -478,13 +478,13 @@ define i64 @test_load64(ptr %a) 
sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -511,10 +511,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2339", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -523,13 +523,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -570,7 +570,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: 
[[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; FASTPATH-NEXT: br label [[TMP9]] @@ -598,10 +598,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2340", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -610,13 +610,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -643,10 +643,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2340", "{x0}"(i64 [[TMP0]]) ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -655,13 +655,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -771,7 +771,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; FASTPATH-NEXT: br label [[TMP9]] @@ -799,10 +799,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2352", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -811,13 +811,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: 
[[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -844,10 +844,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2352", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -856,13 +856,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -903,7 +903,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; FASTPATH-NEXT: br label [[TMP9]] @@ -931,10 +931,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr 
[[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2353", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -943,13 +943,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -976,10 +976,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2353", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -988,13 +988,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label 
[[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1035,7 +1035,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1063,10 +1063,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2354", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1075,13 +1075,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ 
-1108,10 +1108,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2354", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1120,13 +1120,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1167,7 +1167,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1195,10 +1195,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2355", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1207,13 +1207,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1240,10 +1240,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2355", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1252,13 +1252,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof 
[[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1299,7 +1299,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1327,10 +1327,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2356", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1339,13 +1339,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1372,10 +1372,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 
[[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2356", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1384,13 +1384,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1542,43 +1542,43 @@ entry: define i8 @test_load_noattr(ptr %a) { ; CHECK-LABEL: define i8 @test_load_noattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_noattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) { ; NOFASTPATH-NEXT: entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_noattr -; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; FASTPATH-SAME: (ptr [[A:%.*]]) { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; 
ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] @@ -1590,43 +1590,43 @@ entry: define i8 @test_load_notmyattr(ptr %a) sanitize_address { ; CHECK-LABEL: define i8 @test_load_notmyattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_notmyattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; NOFASTPATH-NEXT: entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_notmyattr -; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll b/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll index f72fc0a9720e4a..980189c5607f31 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll @@ -194,7 +194,7 @@ entry: define i8 @test_load_noattr(ptr %a) { ; CHECK-LABEL: define i8 @test_load_noattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] @@ -206,7 +206,7 @@ entry: define i8 @test_load_notmyattr(ptr %a) sanitize_address { ; CHECK-LABEL: define i8 @test_load_notmyattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { 
; CHECK-NEXT: entry:
; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4
; CHECK-NEXT: ret i8 [[B]]
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll b/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll
index 2635dfb75ed98f..00614b603fe799 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll
@@ -18,7 +18,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: @__hwasan_shadow = external global [0 x i8]
 ;.
 define i8 @test_load8(ptr %a) sanitize_hwaddress {
-; CHECK: Function Attrs: nobuiltin sanitize_hwaddress
+; CHECK: Function Attrs: sanitize_hwaddress
 ; CHECK-LABEL: define i8 @test_load8
 ; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: entry:
@@ -33,7 +33,7 @@ entry:
 ret i8 %b
 }
 ;.
-; CHECK: attributes #[[ATTR0]] = { nobuiltin sanitize_hwaddress }
+; CHECK: attributes #[[ATTR0]] = { sanitize_hwaddress }
 ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind }
 ;.
 ; CHECK: [[META0]] = !{ptr @hwasan.note}
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll b/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll
index 919eacb2951f5e..c0e370f20213aa 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll
@@ -11,5 +11,5 @@ entry:
 ret void
 }
-; CHECK: attributes #0 = { nobuiltin sanitize_hwaddress uwtable }
+; CHECK: attributes #0 = { sanitize_hwaddress uwtable }
 attributes #0 = { sanitize_hwaddress memory(argmem: write) uwtable }

From 9764cf888502fe6dd15ab21de5c2f73cae47a2c0 Mon Sep 17 00:00:00 2001
From: Harini0924
Date: Fri, 30 Aug 2024 09:39:35 -0700
Subject: [PATCH 63/98] [llvm-lit] Add precommit test to verify current behavior of glob expansion in lit's internal shell (#106325)

This patch introduces a precommit test to verify the current behavior of glob expansion in lit's internal shell. The motivation for this test stems from an issue encountered in the BOLT test suite when running with the lit internal shell using the command:

`LIT_USE_INTERNAL_SHELL=1 ninja check-bolt`

During execution, the following error was observed:

```
  File "/usr/local/google/home/harinidonthula/llvm-project/llvm/utils/lit/lit/TestRunner.py", line 416, in executeBuiltinEcho
    stdout.write(encode(maybeUnescape(args[-1])))
TypeError: string argument expected, got 'GlobItem'
```

The `executeBuiltinEcho` function in the lit testing framework expects a string to be passed to `stdout.write`, but it received a `GlobItem` object instead. This precommit test captures that current behavior: the glob pattern is never expanded, which is what leads to the `TypeError` above. The patch does not fix the issue; it verifies and documents the behavior as it stands today, so the eventual fix can be validated against it.

The feedback I received on this [PR](https://github.com/llvm/llvm-project/pull/105925) suggests using `cmd.args = expand_glob_expressions(cmd.args, shenv.cwd)` to match the behavior of `executeBuiltinMkdir` and `executeBuiltinRm`, though ideally the internal shell would expand globs before calling any built-in command.

**Request for Feedback:** I'm looking for feedback on how to improve this precommit test, specifically regarding the handling and expansion of glob patterns for commands like mkdir and rm within the internal shell. Currently, the args are expanded at the beginning of these functions, which should ensure proper glob expansion.
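For concreteness, here is a rough sketch of what that kind of up-front expansion amounts to. This is an illustration only, not lit's actual implementation: `expand_glob_args` is a hypothetical stand-in for `expand_glob_expressions`, which in `TestRunner.py` resolves `GlobItem` objects against the shell environment rather than plain pattern strings.

```python
import glob
import os


def expand_glob_args(args, cwd):
    """Simplified stand-in for lit's expand_glob_expressions(cmd.args, shenv.cwd)."""
    # args[0] is the builtin's own name (e.g. "echo") and is never a glob.
    expanded = [args[0]]
    for arg in args[1:]:
        # Expand a wildcard argument into its sorted matches so a builtin
        # such as echo only ever sees plain strings, never GlobItem objects;
        # keep the argument untouched when nothing matches.
        matches = sorted(glob.glob(os.path.join(cwd, arg)))
        expanded.extend(matches if matches else [arg])
    return expanded
```

With `example_file1.input` and `example_file2.input` present in `cwd`, `expand_glob_args(["echo", "example_file*.input"], cwd)` returns the builtin name followed by both matching paths, which is the shape of argument list `executeBuiltinEcho` needs before it writes to stdout.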
However, I'd appreciate guidance on whether I should write additional tests verifying that mkdir and rm handle glob expansion correctly. If such tests are recommended, I'd also appreciate advice on the best way to implement them, given the existing framework and the way glob expansion is expected to work in the internal shell. Should these tests simply confirm that the current implementation passes, or are there specific edge cases I should cover?

**Next Steps:** In a follow-up PR, I plan to address the UNRESOLVED error by expanding the entire command, ensuring correct and consistent behavior across all built-in commands. The current test checks for the unresolved glob-expansion issue by looking for a `TypeError` caused by an unexpanded `GlobItem`; it will be updated to reflect the correct behavior once the issue is resolved.

This change is relevant for [[RFC] Enabling the Lit Internal Shell by Default](https://discourse.llvm.org/t/rfc-enabling-the-lit-internal-shell-by-default/80179/3)
---
 .../lit/tests/Inputs/shtest-glob/example_file1.input | 2 ++
 .../lit/tests/Inputs/shtest-glob/example_file2.input | 2 ++
 .../utils/lit/tests/Inputs/shtest-glob/glob-echo.txt | 2 ++
 .../lit/tests/Inputs/shtest-glob/glob-mkdir.txt | 2 ++
 llvm/utils/lit/tests/Inputs/shtest-glob/lit.cfg | 8 ++++++++
 llvm/utils/lit/tests/shtest-glob.py | 12 ++++++++++++
 6 files changed, 28 insertions(+)
 create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/example_file1.input
 create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/example_file2.input
 create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/glob-echo.txt
 create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt
 create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/lit.cfg
 create mode 100644 llvm/utils/lit/tests/shtest-glob.py

diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/example_file1.input b/llvm/utils/lit/tests/Inputs/shtest-glob/example_file1.input
new file mode 100644
index 00000000000000..0987c9081ca1f3
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-glob/example_file1.input
@@ -0,0 +1,2 @@
+## This is the first example file used for testing glob pattern matching.
+This is the first example file.
diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/example_file2.input b/llvm/utils/lit/tests/Inputs/shtest-glob/example_file2.input
new file mode 100644
index 00000000000000..f1a843f308262e
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-glob/example_file2.input
@@ -0,0 +1,2 @@
+## This is the second example file used for testing glob pattern matching.
+This is the second example file.
diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/glob-echo.txt b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-echo.txt
new file mode 100644
index 00000000000000..b69f5e74fd7281
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-echo.txt
@@ -0,0 +1,2 @@
+## Tests glob pattern expansion by listing matching files.
+# RUN: echo %S/example_file*.input
diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt
new file mode 100644
index 00000000000000..d1329f5dbfaaed
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt
@@ -0,0 +1,2 @@
+## Tests glob pattern handling in the mkdir command.
+# RUN: not mkdir %S/example_file*.input
diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-glob/lit.cfg
new file mode 100644
index 00000000000000..4e5f4cac4c4653
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-glob/lit.cfg
@@ -0,0 +1,8 @@
+import lit.formats
+
+config.name = "shtest-glob"
+config.suffixes = [".txt"]
+config.test_format = lit.formats.ShTest()
+config.test_source_root = None
+config.test_exec_root = None
+config.substitutions.append(("%{python}", '"%s"' % (sys.executable)))
diff --git a/llvm/utils/lit/tests/shtest-glob.py b/llvm/utils/lit/tests/shtest-glob.py
new file mode 100644
index 00000000000000..551331cb38e259
--- /dev/null
+++ b/llvm/utils/lit/tests/shtest-glob.py
@@ -0,0 +1,12 @@
+## Tests glob pattern handling in echo command.
+
+# RUN: not %{lit} -a -v %{inputs}/shtest-glob \
+# RUN: | FileCheck -dump-input=fail -match-full-lines %s
+#
+# END.
+
+# CHECK: UNRESOLVED: shtest-glob :: glob-echo.txt ({{[^)]*}})
+# CHECK: TypeError: string argument expected, got 'GlobItem'
+
+# CHECK: FAIL: shtest-glob :: glob-mkdir.txt ({{[^)]*}})
+# CHECK: # error: command failed with exit status: 1

From 9a0030e0f737fa06a4693a16d546b6336e138304 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 30 Aug 2024 09:43:20 -0700
Subject: [PATCH 64/98] [ARM] Don't use -1 as invalid register number in assembly parser. (#106666)

Use MCRegister instead.
---
 .../lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 162 +++++++++---------
 1 file changed, 81 insertions(+), 81 deletions(-)

diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 8b8452a2b78c80..b7dfcc15824dc7 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -441,7 +441,7 @@ class ARMAsmParser : public MCTargetAsmParser {
 bool validatetSTMRegList(const MCInst &Inst, const OperandVector &Operands,
 unsigned MnemonicOpsEndInd, unsigned ListIndex);
-  int tryParseRegister(bool AllowOutofBoundReg = false);
+  MCRegister tryParseRegister(bool AllowOutofBoundReg = false);
 bool tryParseRegisterWithWriteBack(OperandVector &);
 int tryParseShiftRegister(OperandVector &);
 std::optional tryParseShiftToken();
@@ -4205,7 +4205,7 @@ bool ARMAsmParser::parseRegister(MCRegister &Reg, SMLoc &StartLoc,
 EndLoc = Tok.getEndLoc();
 Reg = tryParseRegister();
-  return Reg == (unsigned)-1;
+  return !Reg;
 }

 ParseStatus ARMAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc,
@@ -4216,59 +4216,59 @@ ParseStatus ARMAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc,
 }

 /// Try to parse a register name. The token must be an Identifier when called,
-/// and if it is a register name the token is eaten and the register number is
-/// returned. Otherwise return -1.
-int ARMAsmParser::tryParseRegister(bool AllowOutOfBoundReg) {
+/// and if it is a register name the token is eaten and the register is
+/// returned. Otherwise return an invalid MCRegister.
+MCRegister ARMAsmParser::tryParseRegister(bool AllowOutOfBoundReg) { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); - if (Tok.isNot(AsmToken::Identifier)) return -1; + if (Tok.isNot(AsmToken::Identifier)) + return MCRegister(); std::string lowerCase = Tok.getString().lower(); - unsigned RegNum = MatchRegisterName(lowerCase); - if (!RegNum) { - RegNum = StringSwitch(lowerCase) - .Case("r13", ARM::SP) - .Case("r14", ARM::LR) - .Case("r15", ARM::PC) - .Case("ip", ARM::R12) - // Additional register name aliases for 'gas' compatibility. - .Case("a1", ARM::R0) - .Case("a2", ARM::R1) - .Case("a3", ARM::R2) - .Case("a4", ARM::R3) - .Case("v1", ARM::R4) - .Case("v2", ARM::R5) - .Case("v3", ARM::R6) - .Case("v4", ARM::R7) - .Case("v5", ARM::R8) - .Case("v6", ARM::R9) - .Case("v7", ARM::R10) - .Case("v8", ARM::R11) - .Case("sb", ARM::R9) - .Case("sl", ARM::R10) - .Case("fp", ARM::R11) - .Default(0); - } - if (!RegNum) { + MCRegister Reg = MatchRegisterName(lowerCase); + if (!Reg) { + Reg = StringSwitch(lowerCase) + .Case("r13", ARM::SP) + .Case("r14", ARM::LR) + .Case("r15", ARM::PC) + .Case("ip", ARM::R12) + // Additional register name aliases for 'gas' compatibility. + .Case("a1", ARM::R0) + .Case("a2", ARM::R1) + .Case("a3", ARM::R2) + .Case("a4", ARM::R3) + .Case("v1", ARM::R4) + .Case("v2", ARM::R5) + .Case("v3", ARM::R6) + .Case("v4", ARM::R7) + .Case("v5", ARM::R8) + .Case("v6", ARM::R9) + .Case("v7", ARM::R10) + .Case("v8", ARM::R11) + .Case("sb", ARM::R9) + .Case("sl", ARM::R10) + .Case("fp", ARM::R11) + .Default(MCRegister()); + } + if (!Reg) { // Check for aliases registered via .req. Canonicalize to lower case. // That's more consistent since register names are case insensitive, and // it's how the original entry was passed in from MC/MCParser/AsmParser. auto Entry = RegisterReqs.find(lowerCase); // If no match, return failure. if (Entry == RegisterReqs.end()) - return -1; + return MCRegister(); Parser.Lex(); // Eat identifier token. return Entry->getValue(); } // Some FPUs only have 16 D registers, so D16-D31 are invalid - if (!AllowOutOfBoundReg && !hasD32() && RegNum >= ARM::D16 && - RegNum <= ARM::D31) - return -1; + if (!AllowOutOfBoundReg && !hasD32() && Reg >= ARM::D16 && Reg <= ARM::D31) + return MCRegister(); Parser.Lex(); // Eat identifier token. - return RegNum; + return Reg; } std::optional ARMAsmParser::tryParseShiftToken() { @@ -4356,7 +4356,7 @@ int ARMAsmParser::tryParseShiftRegister(OperandVector &Operands) { SMLoc L = Parser.getTok().getLoc(); EndLoc = Parser.getTok().getEndLoc(); ShiftReg = tryParseRegister(); - if (ShiftReg == -1) { + if (!ShiftReg) { Error(L, "expected immediate or register in shift operand"); return -1; } @@ -4387,12 +4387,11 @@ bool ARMAsmParser::tryParseRegisterWithWriteBack(OperandVector &Operands) { MCAsmParser &Parser = getParser(); SMLoc RegStartLoc = Parser.getTok().getLoc(); SMLoc RegEndLoc = Parser.getTok().getEndLoc(); - int RegNo = tryParseRegister(); - if (RegNo == -1) + MCRegister Reg = tryParseRegister(); + if (!Reg) return true; - Operands.push_back( - ARMOperand::CreateReg(RegNo, RegStartLoc, RegEndLoc, *this)); + Operands.push_back(ARMOperand::CreateReg(Reg, RegStartLoc, RegEndLoc, *this)); const AsmToken &ExclaimTok = Parser.getTok(); if (ExclaimTok.is(AsmToken::Exclaim)) { @@ -4619,8 +4618,8 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, // Check the first register in the list to see what register class // this is a list of. 
- int Reg = tryParseRegister(); - if (Reg == -1) + MCRegister Reg = tryParseRegister(); + if (!Reg) return Error(RegLoc, "register expected"); if (!AllowRAAC && Reg == ARM::RA_AUTH_CODE) return Error(RegLoc, "pseudo-register not allowed"); @@ -4634,7 +4633,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, Reg = getDRegFromQReg(Reg); EReg = MRI->getEncodingValue(Reg); Registers.emplace_back(EReg, Reg); - ++Reg; + Reg = Reg + 1; } const MCRegisterClass *RC; if (Reg == ARM::RA_AUTH_CODE || @@ -4663,8 +4662,8 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, return Error(RegLoc, "pseudo-register not allowed"); Parser.Lex(); // Eat the minus. SMLoc AfterMinusLoc = Parser.getTok().getLoc(); - int EndReg = tryParseRegister(AllowOutOfBoundReg); - if (EndReg == -1) + MCRegister EndReg = tryParseRegister(AllowOutOfBoundReg); + if (!EndReg) return Error(AfterMinusLoc, "register expected"); if (EndReg == ARM::RA_AUTH_CODE) return Error(AfterMinusLoc, "pseudo-register not allowed"); @@ -4696,10 +4695,10 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, } Parser.Lex(); // Eat the comma. RegLoc = Parser.getTok().getLoc(); - int OldReg = Reg; + MCRegister OldReg = Reg; const AsmToken RegTok = Parser.getTok(); Reg = tryParseRegister(AllowOutOfBoundReg); - if (Reg == -1) + if (!Reg) return Error(RegLoc, "register expected"); if (!AllowRAAC && Reg == ARM::RA_AUTH_CODE) return Error(RegLoc, "pseudo-register not allowed"); @@ -4755,7 +4754,8 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, ") in register list"); } if (isQReg) { - EReg = MRI->getEncodingValue(++Reg); + Reg = Reg + 1; + EReg = MRI->getEncodingValue(Reg); Registers.emplace_back(EReg, Reg); } } @@ -4835,8 +4835,8 @@ ParseStatus ARMAsmParser::parseVectorList(OperandVector &Operands) { // use the custom matcher to convert to list if necessary if (!hasMVE() && Parser.getTok().is(AsmToken::Identifier)) { SMLoc E = Parser.getTok().getEndLoc(); - int Reg = tryParseRegister(); - if (Reg == -1) + MCRegister Reg = tryParseRegister(); + if (!Reg) return ParseStatus::NoMatch; if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) { ParseStatus Res = parseVectorLane(LaneKind, LaneIndex, E); @@ -4889,12 +4889,12 @@ ParseStatus ARMAsmParser::parseVectorList(OperandVector &Operands) { Parser.Lex(); // Eat '{' token. SMLoc RegLoc = Parser.getTok().getLoc(); - int Reg = tryParseRegister(); - if (Reg == -1) + MCRegister Reg = tryParseRegister(); + if (!Reg) return Error(RegLoc, "register expected"); unsigned Count = 1; int Spacing = 0; - unsigned FirstReg = Reg; + MCRegister FirstReg = Reg; if (hasMVE() && !ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(Reg)) return Error(Parser.getTok().getLoc(), @@ -4905,7 +4905,7 @@ ParseStatus ARMAsmParser::parseVectorList(OperandVector &Operands) { FirstReg = Reg = getDRegFromQReg(Reg); Spacing = 1; // double-spacing requires explicit D registers, otherwise // it's ambiguous with four-register single spaced. - ++Reg; + Reg = Reg + 1; ++Count; } @@ -4923,8 +4923,8 @@ ParseStatus ARMAsmParser::parseVectorList(OperandVector &Operands) { "sequential registers in double spaced list"); Parser.Lex(); // Eat the minus. 
SMLoc AfterMinusLoc = Parser.getTok().getLoc(); - int EndReg = tryParseRegister(); - if (EndReg == -1) + MCRegister EndReg = tryParseRegister(); + if (!EndReg) return Error(AfterMinusLoc, "register expected"); // Allow Q regs and just interpret them as the two D sub-registers. if (!hasMVE() && ARMMCRegisterClasses[ARM::QPRRegClassID].contains(EndReg)) @@ -4957,9 +4957,9 @@ ParseStatus ARMAsmParser::parseVectorList(OperandVector &Operands) { } Parser.Lex(); // Eat the comma. RegLoc = Parser.getTok().getLoc(); - int OldReg = Reg; + MCRegister OldReg = Reg; Reg = tryParseRegister(); - if (Reg == -1) + if (!Reg) return Error(RegLoc, "register expected"); if (hasMVE()) { @@ -4983,7 +4983,7 @@ ParseStatus ARMAsmParser::parseVectorList(OperandVector &Operands) { Reg = getDRegFromQReg(Reg); if (Reg != OldReg + 1) return Error(RegLoc, "non-contiguous register range"); - ++Reg; + Reg = Reg + 1; Count += 2; // Parse the lane specifier if present. VectorLaneTy NextLaneKind; @@ -5674,8 +5674,8 @@ ParseStatus ARMAsmParser::parsePostIdxReg(OperandVector &Operands) { } SMLoc E = Parser.getTok().getEndLoc(); - int Reg = tryParseRegister(); - if (Reg == -1) { + MCRegister Reg = tryParseRegister(); + if (!Reg) { if (!haveEaten) return ParseStatus::NoMatch; return Error(Parser.getTok().getLoc(), "register expected"); @@ -5752,8 +5752,8 @@ ParseStatus ARMAsmParser::parseAM3Offset(OperandVector &Operands) { } Tok = Parser.getTok(); - int Reg = tryParseRegister(); - if (Reg == -1) { + MCRegister Reg = tryParseRegister(); + if (!Reg) { if (!haveEaten) return ParseStatus::NoMatch; return Error(Tok.getLoc(), "register expected"); @@ -5935,8 +5935,8 @@ bool ARMAsmParser::parseMemory(OperandVector &Operands) { Parser.Lex(); // Eat left bracket token. const AsmToken &BaseRegTok = Parser.getTok(); - int BaseRegNum = tryParseRegister(); - if (BaseRegNum == -1) + MCRegister BaseReg = tryParseRegister(); + if (!BaseReg) return Error(BaseRegTok.getLoc(), "register expected"); // The next token must either be a comma, a colon or a closing bracket. @@ -5950,7 +5950,7 @@ bool ARMAsmParser::parseMemory(OperandVector &Operands) { Parser.Lex(); // Eat right bracket token. Operands.push_back(ARMOperand::CreateMem( - BaseRegNum, nullptr, 0, ARM_AM::no_shift, 0, 0, false, S, E, *this)); + BaseReg, nullptr, 0, ARM_AM::no_shift, 0, 0, false, S, E, *this)); // If there's a pre-indexing writeback marker, '!', just add it as a token // operand. It's rather odd, but syntactically valid. @@ -6006,7 +6006,7 @@ bool ARMAsmParser::parseMemory(OperandVector &Operands) { // Don't worry about range checking the value here. That's handled by // the is*() predicates. - Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, 0, + Operands.push_back(ARMOperand::CreateMem(BaseReg, nullptr, 0, ARM_AM::no_shift, 0, Align, false, S, E, *this, AlignmentLoc)); @@ -6050,7 +6050,7 @@ bool ARMAsmParser::parseMemory(OperandVector &Operands) { AdjustedOffset = CE; } else AdjustedOffset = Offset; - Operands.push_back(ARMOperand::CreateMem(BaseRegNum, AdjustedOffset, 0, + Operands.push_back(ARMOperand::CreateMem(BaseReg, AdjustedOffset, 0, ARM_AM::no_shift, 0, 0, false, S, E, *this)); @@ -6082,8 +6082,8 @@ bool ARMAsmParser::parseMemory(OperandVector &Operands) { } E = Parser.getTok().getLoc(); - int OffsetRegNum = tryParseRegister(); - if (OffsetRegNum == -1) + MCRegister OffsetReg = tryParseRegister(); + if (!OffsetReg) return Error(E, "register expected"); // If there's a shift operator, handle it. 
@@ -6101,7 +6101,7 @@ bool ARMAsmParser::parseMemory(OperandVector &Operands) { E = Parser.getTok().getEndLoc(); Parser.Lex(); // Eat right bracket token. - Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, OffsetRegNum, + Operands.push_back(ARMOperand::CreateMem(BaseReg, nullptr, OffsetReg, ShiftType, ShiftImm, 0, isNegative, S, E, *this)); @@ -12077,16 +12077,16 @@ bool ARMAsmParser::parseDirectiveSetFP(SMLoc L) { // Parse fpreg SMLoc FPRegLoc = Parser.getTok().getLoc(); - int FPReg = tryParseRegister(); + MCRegister FPReg = tryParseRegister(); - if (check(FPReg == -1, FPRegLoc, "frame pointer register expected") || + if (check(!FPReg, FPRegLoc, "frame pointer register expected") || Parser.parseComma()) return true; // Parse spreg SMLoc SPRegLoc = Parser.getTok().getLoc(); - int SPReg = tryParseRegister(); - if (check(SPReg == -1, SPRegLoc, "stack pointer register expected") || + MCRegister SPReg = tryParseRegister(); + if (check(!SPReg, SPRegLoc, "stack pointer register expected") || check(SPReg != ARM::SP && SPReg != UC.getFPReg(), SPRegLoc, "register should be either $sp or the latest fp register")) return true; @@ -12404,8 +12404,8 @@ bool ARMAsmParser::parseDirectiveMovSP(SMLoc L) { return Error(L, "unexpected .movsp directive"); SMLoc SPRegLoc = Parser.getTok().getLoc(); - int SPReg = tryParseRegister(); - if (SPReg == -1) + MCRegister SPReg = tryParseRegister(); + if (!SPReg) return Error(SPRegLoc, "register expected"); if (SPReg == ARM::SP || SPReg == ARM::PC) return Error(SPRegLoc, "sp and pc are not permitted in .movsp directive"); @@ -12542,8 +12542,8 @@ bool ARMAsmParser::parseDirectiveSEHSaveRegs(SMLoc L, bool Wide) { /// parseDirectiveSEHSaveSP /// ::= .seh_save_sp bool ARMAsmParser::parseDirectiveSEHSaveSP(SMLoc L) { - int Reg = tryParseRegister(); - if (Reg == -1 || !MRI->getRegClass(ARM::GPRRegClassID).contains(Reg)) + MCRegister Reg = tryParseRegister(); + if (!Reg || !MRI->getRegClass(ARM::GPRRegClassID).contains(Reg)) return Error(L, "expected GPR"); unsigned Index = MRI->getEncodingValue(Reg); if (Index > 14 || Index == 13) From 688843bda88e6dcc4f66a1283717258438dbbb96 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 30 Aug 2024 09:43:42 -0700 Subject: [PATCH 65/98] [RISCV] Add constant folding combine for FMV_X_ANYEXTW/H. (#106653) --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 7 +++++ llvm/test/CodeGen/RISCV/calling-conv-half.ll | 9 +++---- llvm/test/CodeGen/RISCV/float-imm.ll | 4 +-- llvm/test/CodeGen/RISCV/half-imm.ll | 28 ++++++++++++-------- 4 files changed, 29 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 09928dcc1f489a..33bc4b063bbb48 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -16440,6 +16440,13 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, SDLoc DL(N); SDValue Op0 = N->getOperand(0); MVT VT = N->getSimpleValueType(0); + + // Constant fold. + if (auto *CFP = dyn_cast(Op0)) { + APInt Val = CFP->getValueAPF().bitcastToAPInt().sext(VT.getSizeInBits()); + return DAG.getConstant(Val, DL, VT); + } + // If the input to FMV_X_ANYEXTW_RV64 is just FMV_W_X_RV64 then the // conversion is unnecessary and can be replaced with the FMV_W_X_RV64 // operand. Similar for FMV_X_ANYEXTH and FMV_H_X. 
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-half.ll b/llvm/test/CodeGen/RISCV/calling-conv-half.ll index c88b2bf596ca23..cccb69d2e6986a 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-half.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-half.ll @@ -161,10 +161,8 @@ define i32 @caller_half_in_regs() nounwind { ; RV64IF: # %bb.0: ; RV64IF-NEXT: addi sp, sp, -16 ; RV64IF-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64IF-NEXT: lui a0, 1048564 -; RV64IF-NEXT: fmv.w.x fa5, a0 -; RV64IF-NEXT: fmv.x.w a1, fa5 ; RV64IF-NEXT: li a0, 1 +; RV64IF-NEXT: lui a1, 1048564 ; RV64IF-NEXT: call callee_half_in_regs ; RV64IF-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64IF-NEXT: addi sp, sp, 16 @@ -511,9 +509,8 @@ define half @callee_half_ret() nounwind { ; ; RV64IF-LABEL: callee_half_ret: ; RV64IF: # %bb.0: -; RV64IF-NEXT: lui a0, %hi(.LCPI4_0) -; RV64IF-NEXT: flw fa5, %lo(.LCPI4_0)(a0) -; RV64IF-NEXT: fmv.x.w a0, fa5 +; RV64IF-NEXT: lui a0, 1048564 +; RV64IF-NEXT: addiw a0, a0, -1024 ; RV64IF-NEXT: ret ; ; RV32-ILP32F-LABEL: callee_half_ret: diff --git a/llvm/test/CodeGen/RISCV/float-imm.ll b/llvm/test/CodeGen/RISCV/float-imm.ll index c38416d994ba57..69a506cd850f2c 100644 --- a/llvm/test/CodeGen/RISCV/float-imm.ll +++ b/llvm/test/CodeGen/RISCV/float-imm.ll @@ -24,8 +24,8 @@ define float @float_imm() nounwind { ; ; RV64ZFINX-LABEL: float_imm: ; RV64ZFINX: # %bb.0: -; RV64ZFINX-NEXT: lui a0, %hi(.LCPI0_0) -; RV64ZFINX-NEXT: lw a0, %lo(.LCPI0_0)(a0) +; RV64ZFINX-NEXT: lui a0, 263313 +; RV64ZFINX-NEXT: addiw a0, a0, -37 ; RV64ZFINX-NEXT: ret ret float 3.14159274101257324218750 } diff --git a/llvm/test/CodeGen/RISCV/half-imm.ll b/llvm/test/CodeGen/RISCV/half-imm.ll index 9c11010540e15d..4c39885176f01a 100644 --- a/llvm/test/CodeGen/RISCV/half-imm.ll +++ b/llvm/test/CodeGen/RISCV/half-imm.ll @@ -15,10 +15,10 @@ ; RUN: -target-abi lp64f < %s | FileCheck -check-prefixes=CHECKIZFHMIN %s ; RUN: llc -mtriple=riscv32 -mattr=+zhinxmin -verify-machineinstrs \ ; RUN: -target-abi ilp32 < %s \ -; RUN: | FileCheck -check-prefixes=CHECKIZHINXMIN %s +; RUN: | FileCheck -check-prefixes=CHECKIZHINXMIN,RV32IZHINXMIN %s ; RUN: llc -mtriple=riscv64 -mattr=+zhinxmin -verify-machineinstrs \ ; RUN: -target-abi lp64 < %s \ -; RUN: | FileCheck -check-prefixes=CHECKIZHINXMIN %s +; RUN: | FileCheck -check-prefixes=CHECKIZHINXMIN,RV64IZHINXMIN %s ; TODO: constant pool shouldn't be necessary for RV32IZfh and RV64IZfh define half @half_imm() nounwind { @@ -30,14 +30,14 @@ define half @half_imm() nounwind { ; ; RV32IZHINX-LABEL: half_imm: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a0, %hi(.LCPI0_0) -; RV32IZHINX-NEXT: lh a0, %lo(.LCPI0_0)(a0) +; RV32IZHINX-NEXT: lui a0, 4 +; RV32IZHINX-NEXT: addi a0, a0, 512 ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: half_imm: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a0, %hi(.LCPI0_0) -; RV64IZHINX-NEXT: lh a0, %lo(.LCPI0_0)(a0) +; RV64IZHINX-NEXT: lui a0, 4 +; RV64IZHINX-NEXT: addiw a0, a0, 512 ; RV64IZHINX-NEXT: ret ; ; CHECKIZFHMIN-LABEL: half_imm: @@ -46,11 +46,17 @@ define half @half_imm() nounwind { ; CHECKIZFHMIN-NEXT: flh fa0, %lo(.LCPI0_0)(a0) ; CHECKIZFHMIN-NEXT: ret ; -; CHECKIZHINXMIN-LABEL: half_imm: -; CHECKIZHINXMIN: # %bb.0: -; CHECKIZHINXMIN-NEXT: lui a0, %hi(.LCPI0_0) -; CHECKIZHINXMIN-NEXT: lh a0, %lo(.LCPI0_0)(a0) -; CHECKIZHINXMIN-NEXT: ret +; RV32IZHINXMIN-LABEL: half_imm: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: lui a0, 4 +; RV32IZHINXMIN-NEXT: addi a0, a0, 512 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: half_imm: +; RV64IZHINXMIN: # %bb.0: +; 
RV64IZHINXMIN-NEXT: lui a0, 4 +; RV64IZHINXMIN-NEXT: addiw a0, a0, 512 +; RV64IZHINXMIN-NEXT: ret ret half 3.0 } From c25293c6dd9a71b4655d1d6497ab8576c15e446e Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 30 Aug 2024 09:44:51 -0700 Subject: [PATCH 66/98] [LegalizeVectorOps][RISCV] Don't promote VP_FABS/FNEG/FCOPYSIGN. (#106659) Promoting canonicalizes NaNs which changes the semantics. Bitcast to integer and use logic ops instead. --- .../SelectionDAG/LegalizeVectorOps.cpp | 100 ++++++++ llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 34 ++- .../RISCV/rvv/fixed-vectors-vfabs-vp.ll | 80 +++---- .../RISCV/rvv/fixed-vectors-vfneg-vp.ll | 72 ++---- llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll | 226 ++++++------------ llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll | 160 ++++--------- llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll | 148 +++--------- 7 files changed, 336 insertions(+), 484 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 2557fa288606e7..87221c14433ab5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -135,6 +135,9 @@ class VectorLegalizer { SDValue ExpandVP_SELECT(SDNode *Node); SDValue ExpandVP_MERGE(SDNode *Node); SDValue ExpandVP_REM(SDNode *Node); + SDValue ExpandVP_FNEG(SDNode *Node); + SDValue ExpandVP_FABS(SDNode *Node); + SDValue ExpandVP_FCOPYSIGN(SDNode *Node); SDValue ExpandSELECT(SDNode *Node); std::pair ExpandLoad(SDNode *N); SDValue ExpandStore(SDNode *N); @@ -699,6 +702,11 @@ void VectorLegalizer::Promote(SDNode *Node, SmallVectorImpl &Results) { // These operations are used to do promotion so they can't be promoted // themselves. llvm_unreachable("Don't know how to promote this operation!"); + case ISD::VP_FABS: + case ISD::VP_FCOPYSIGN: + case ISD::VP_FNEG: + // Promoting fabs, fneg, and fcopysign changes their semantics. 
+ llvm_unreachable("These operations should not be promoted"); } // There are currently two cases of vector promotion: @@ -887,6 +895,24 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { return; } break; + case ISD::VP_FNEG: + if (SDValue Expanded = ExpandVP_FNEG(Node)) { + Results.push_back(Expanded); + return; + } + break; + case ISD::VP_FABS: + if (SDValue Expanded = ExpandVP_FABS(Node)) { + Results.push_back(Expanded); + return; + } + break; + case ISD::VP_FCOPYSIGN: + if (SDValue Expanded = ExpandVP_FCOPYSIGN(Node)) { + Results.push_back(Expanded); + return; + } + break; case ISD::SELECT: Results.push_back(ExpandSELECT(Node)); return; @@ -1557,6 +1583,80 @@ SDValue VectorLegalizer::ExpandVP_REM(SDNode *Node) { return DAG.getNode(ISD::VP_SUB, DL, VT, Dividend, Mul, Mask, EVL); } +SDValue VectorLegalizer::ExpandVP_FNEG(SDNode *Node) { + EVT VT = Node->getValueType(0); + EVT IntVT = VT.changeVectorElementTypeToInteger(); + + if (!TLI.isOperationLegalOrCustom(ISD::VP_XOR, IntVT)) + return SDValue(); + + SDValue Mask = Node->getOperand(1); + SDValue EVL = Node->getOperand(2); + + SDLoc DL(Node); + SDValue Cast = DAG.getNode(ISD::BITCAST, DL, IntVT, Node->getOperand(0)); + SDValue SignMask = DAG.getConstant( + APInt::getSignMask(IntVT.getScalarSizeInBits()), DL, IntVT); + SDValue Xor = DAG.getNode(ISD::VP_XOR, DL, IntVT, Cast, SignMask, Mask, EVL); + return DAG.getNode(ISD::BITCAST, DL, VT, Xor); +} + +SDValue VectorLegalizer::ExpandVP_FABS(SDNode *Node) { + EVT VT = Node->getValueType(0); + EVT IntVT = VT.changeVectorElementTypeToInteger(); + + if (!TLI.isOperationLegalOrCustom(ISD::VP_AND, IntVT)) + return SDValue(); + + SDValue Mask = Node->getOperand(1); + SDValue EVL = Node->getOperand(2); + + SDLoc DL(Node); + SDValue Cast = DAG.getNode(ISD::BITCAST, DL, IntVT, Node->getOperand(0)); + SDValue ClearSignMask = DAG.getConstant( + APInt::getSignedMaxValue(IntVT.getScalarSizeInBits()), DL, IntVT); + SDValue ClearSign = + DAG.getNode(ISD::VP_AND, DL, IntVT, Cast, ClearSignMask, Mask, EVL); + return DAG.getNode(ISD::BITCAST, DL, VT, ClearSign); +} + +SDValue VectorLegalizer::ExpandVP_FCOPYSIGN(SDNode *Node) { + EVT VT = Node->getValueType(0); + + if (VT != Node->getOperand(1).getValueType()) + return SDValue(); + + EVT IntVT = VT.changeVectorElementTypeToInteger(); + if (!TLI.isOperationLegalOrCustom(ISD::VP_AND, IntVT) || + !TLI.isOperationLegalOrCustom(ISD::VP_XOR, IntVT)) + return SDValue(); + + SDValue Mask = Node->getOperand(2); + SDValue EVL = Node->getOperand(3); + + SDLoc DL(Node); + SDValue Mag = DAG.getNode(ISD::BITCAST, DL, IntVT, Node->getOperand(0)); + SDValue Sign = DAG.getNode(ISD::BITCAST, DL, IntVT, Node->getOperand(1)); + + SDValue SignMask = DAG.getConstant( + APInt::getSignMask(IntVT.getScalarSizeInBits()), DL, IntVT); + SDValue SignBit = + DAG.getNode(ISD::VP_AND, DL, IntVT, Sign, SignMask, Mask, EVL); + + SDValue ClearSignMask = DAG.getConstant( + APInt::getSignedMaxValue(IntVT.getScalarSizeInBits()), DL, IntVT); + SDValue ClearedSign = + DAG.getNode(ISD::VP_AND, DL, IntVT, Mag, ClearSignMask, Mask, EVL); + + SDNodeFlags Flags; + Flags.setDisjoint(true); + + SDValue CopiedSign = DAG.getNode(ISD::VP_OR, DL, IntVT, ClearedSign, SignBit, + Mask, EVL, Flags); + + return DAG.getNode(ISD::BITCAST, DL, VT, CopiedSign); +} + void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node, SmallVectorImpl &Results) { // Attempt to expand using TargetLowering. 
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 33bc4b063bbb48..e990325ac38279 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -891,16 +891,30 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::STRICT_FDIV, ISD::STRICT_FSQRT, ISD::STRICT_FMA}; // TODO: support more vp ops. - static const unsigned ZvfhminPromoteVPOps[] = { - ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL, - ISD::VP_FDIV, ISD::VP_FNEG, ISD::VP_FABS, - ISD::VP_FMA, ISD::VP_REDUCE_FADD, ISD::VP_REDUCE_SEQ_FADD, - ISD::VP_REDUCE_FMIN, ISD::VP_REDUCE_FMAX, ISD::VP_SQRT, - ISD::VP_FMINNUM, ISD::VP_FMAXNUM, ISD::VP_FCEIL, - ISD::VP_FFLOOR, ISD::VP_FROUND, ISD::VP_FROUNDEVEN, - ISD::VP_FCOPYSIGN, ISD::VP_FROUNDTOZERO, ISD::VP_FRINT, - ISD::VP_FNEARBYINT, ISD::VP_SETCC, ISD::VP_FMINIMUM, - ISD::VP_FMAXIMUM, ISD::VP_REDUCE_FMINIMUM, ISD::VP_REDUCE_FMAXIMUM}; + static const unsigned ZvfhminPromoteVPOps[] = {ISD::VP_FADD, + ISD::VP_FSUB, + ISD::VP_FMUL, + ISD::VP_FDIV, + ISD::VP_FMA, + ISD::VP_REDUCE_FADD, + ISD::VP_REDUCE_SEQ_FADD, + ISD::VP_REDUCE_FMIN, + ISD::VP_REDUCE_FMAX, + ISD::VP_SQRT, + ISD::VP_FMINNUM, + ISD::VP_FMAXNUM, + ISD::VP_FCEIL, + ISD::VP_FFLOOR, + ISD::VP_FROUND, + ISD::VP_FROUNDEVEN, + ISD::VP_FROUNDTOZERO, + ISD::VP_FRINT, + ISD::VP_FNEARBYINT, + ISD::VP_SETCC, + ISD::VP_FMINIMUM, + ISD::VP_FMAXIMUM, + ISD::VP_REDUCE_FMINIMUM, + ISD::VP_REDUCE_FMAXIMUM}; // Sets common operation actions on RVV floating-point vector types. const auto SetCommonVFPActions = [&](MVT VT) { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll index ae3dce497c6d07..90a856605c70d8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll @@ -19,12 +19,10 @@ define <2 x half> @vfabs_vv_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vfabs_vv_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfabs.v v9, v9, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.fabs.v2f16(<2 x half> %va, <2 x i1> %m, i32 %evl) ret <2 x half> %v @@ -39,12 +37,10 @@ define <2 x half> @vfabs_vv_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ; ZVFHMIN-LABEL: vfabs_vv_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfabs.v v9, v9 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.fabs.v2f16(<2 x half> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x half> %v @@ -61,12 +57,10 @@ define <4 x half> @vfabs_vv_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vfabs_vv_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; 
ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfabs.v v9, v9, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.fabs.v4f16(<4 x half> %va, <4 x i1> %m, i32 %evl) ret <4 x half> %v @@ -81,12 +75,10 @@ define <4 x half> @vfabs_vv_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ; ZVFHMIN-LABEL: vfabs_vv_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfabs.v v9, v9 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.fabs.v4f16(<4 x half> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x half> %v @@ -103,12 +95,10 @@ define <8 x half> @vfabs_vv_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vfabs_vv_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfabs.v v10, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.fabs.v8f16(<8 x half> %va, <8 x i1> %m, i32 %evl) ret <8 x half> %v @@ -123,12 +113,10 @@ define <8 x half> @vfabs_vv_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ; ZVFHMIN-LABEL: vfabs_vv_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfabs.v v10, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.fabs.v8f16(<8 x half> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x half> %v @@ -145,12 +133,10 @@ define <16 x half> @vfabs_vv_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext % ; ; ZVFHMIN-LABEL: vfabs_vv_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfabs.v v12, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.fabs.v16f16(<16 x half> %va, <16 x i1> %m, i32 %evl) ret <16 x half> %v @@ -165,12 +151,10 @@ define <16 x half> @vfabs_vv_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vfabs_vv_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, 
e32, m4, ta, ma -; ZVFHMIN-NEXT: vfabs.v v12, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.fabs.v16f16(<16 x half> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x half> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll index fbc4c56a911340..019923ffdfdedf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll @@ -19,12 +19,9 @@ define <2 x half> @vfneg_vv_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vfneg_vv_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v9, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> %va, <2 x i1> %m, i32 %evl) ret <2 x half> %v @@ -39,12 +36,9 @@ define <2 x half> @vfneg_vv_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ; ZVFHMIN-LABEL: vfneg_vv_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v9 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x half> %v @@ -61,12 +55,9 @@ define <4 x half> @vfneg_vv_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vfneg_vv_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v9, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> %va, <4 x i1> %m, i32 %evl) ret <4 x half> %v @@ -81,12 +72,9 @@ define <4 x half> @vfneg_vv_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ; ZVFHMIN-LABEL: vfneg_vv_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v9 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x half> %v @@ -103,12 +91,9 @@ define <8 x half> @vfneg_vv_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vfneg_vv_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli 
zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v10, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> %va, <8 x i1> %m, i32 %evl) ret <8 x half> %v @@ -123,12 +108,9 @@ define <8 x half> @vfneg_vv_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ; ZVFHMIN-LABEL: vfneg_vv_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v10, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x half> %v @@ -145,12 +127,9 @@ define <16 x half> @vfneg_vv_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext % ; ; ZVFHMIN-LABEL: vfneg_vv_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfneg.v v12, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> %va, <16 x i1> %m, i32 %evl) ret <16 x half> %v @@ -165,12 +144,9 @@ define <16 x half> @vfneg_vv_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vfneg_vv_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfneg.v v12, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x half> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll index 6e34d59a2d9894..e8a7d790758596 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll @@ -19,13 +19,12 @@ define @vfsgnj_vv_nxv1f16( %va, @llvm.vp.copysign.nxv1f16( %va, %vb, %m, i32 %evl) ret %v @@ -40,13 +39,12 @@ define @vfsgnj_vv_nxv1f16_unmasked( %va, ; ; ZVFHMIN-LABEL: vfsgnj_vv_nxv1f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %v = call 
@llvm.vp.copysign.nxv1f16( %va, %vb, splat (i1 true), i32 %evl) ret %v @@ -63,13 +61,12 @@ define @vfsgnj_vv_nxv2f16( %va, @llvm.vp.copysign.nxv2f16( %va, %vb, %m, i32 %evl) ret %v @@ -84,13 +81,12 @@ define @vfsgnj_vv_nxv2f16_unmasked( %va, ; ; ZVFHMIN-LABEL: vfsgnj_vv_nxv2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.copysign.nxv2f16( %va, %vb, splat (i1 true), i32 %evl) ret %v @@ -107,13 +103,12 @@ define @vfsgnj_vv_nxv4f16( %va, @llvm.vp.copysign.nxv4f16( %va, %vb, %m, i32 %evl) ret %v @@ -128,13 +123,12 @@ define @vfsgnj_vv_nxv4f16_unmasked( %va, ; ; ZVFHMIN-LABEL: vfsgnj_vv_nxv4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v10, v12, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.copysign.nxv4f16( %va, %vb, splat (i1 true), i32 %evl) ret %v @@ -151,13 +145,12 @@ define @vfsgnj_vv_nxv8f16( %va, @llvm.vp.copysign.nxv8f16( %va, %vb, %m, i32 %evl) ret %v @@ -172,13 +165,12 @@ define @vfsgnj_vv_nxv8f16_unmasked( %va, ; ; ZVFHMIN-LABEL: vfsgnj_vv_nxv8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v12, v16, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vand.vx v10, v10, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v10 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.copysign.nxv8f16( %va, %vb, splat (i1 true), i32 %evl) ret %v @@ -195,13 +187,12 @@ define @vfsgnj_vv_nxv16f16( %va, @llvm.vp.copysign.nxv16f16( %va, %vb, %m, i32 %evl) ret %v @@ -216,13 +207,12 @@ define @vfsgnj_vv_nxv16f16_unmasked( %v ; ; ZVFHMIN-LABEL: vfsgnj_vv_nxv16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vand.vx v12, v12, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v12 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.copysign.nxv16f16( %va, %vb, splat (i1 true), i32 %evl) ret %v 
@@ -239,48 +229,12 @@ define @vfsgnj_vv_nxv32f16( %va, @llvm.vp.copysign.nxv32f16( %va, %vb, %m, i32 %evl) ret %v @@ -295,48 +249,12 @@ define @vfsgnj_vv_nxv32f16_unmasked( %v ; ; ZVFHMIN-LABEL: vfsgnj_vv_nxv32f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 -; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB11_2: -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vand.vx v16, v16, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v16 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.copysign.nxv32f16( %va, %vb, splat (i1 true), i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll index 0f7e3f1e0ea5a2..b9be6eb1fa3737 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll @@ -19,12 +19,10 @@ define @vfabs_vv_nxv1f16( %va, @llvm.vp.fabs.nxv1f16( %va, %m, i32 %evl) ret %v @@ -39,12 +37,10 @@ define @vfabs_vv_nxv1f16_unmasked( %va, i ; ; ZVFHMIN-LABEL: vfabs_vv_nxv1f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfabs.v v9, v9 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fabs.nxv1f16( %va, splat (i1 true), i32 %evl) ret %v @@ -61,12 +57,10 @@ define @vfabs_vv_nxv2f16( %va, @llvm.vp.fabs.nxv2f16( %va, %m, i32 %evl) ret %v @@ -81,12 +75,10 @@ define @vfabs_vv_nxv2f16_unmasked( %va, i ; ; ZVFHMIN-LABEL: vfabs_vv_nxv2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: 
vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfabs.v v9, v9 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fabs.nxv2f16( %va, splat (i1 true), i32 %evl) ret %v @@ -103,12 +95,10 @@ define @vfabs_vv_nxv4f16( %va, @llvm.vp.fabs.nxv4f16( %va, %m, i32 %evl) ret %v @@ -123,12 +113,10 @@ define @vfabs_vv_nxv4f16_unmasked( %va, i ; ; ZVFHMIN-LABEL: vfabs_vv_nxv4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfabs.v v10, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fabs.nxv4f16( %va, splat (i1 true), i32 %evl) ret %v @@ -145,12 +133,10 @@ define @vfabs_vv_nxv8f16( %va, @llvm.vp.fabs.nxv8f16( %va, %m, i32 %evl) ret %v @@ -165,12 +151,10 @@ define @vfabs_vv_nxv8f16_unmasked( %va, i ; ; ZVFHMIN-LABEL: vfabs_vv_nxv8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfabs.v v12, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fabs.nxv8f16( %va, splat (i1 true), i32 %evl) ret %v @@ -187,12 +171,10 @@ define @vfabs_vv_nxv16f16( %va, @llvm.vp.fabs.nxv16f16( %va, %m, i32 %evl) ret %v @@ -207,12 +189,10 @@ define @vfabs_vv_nxv16f16_unmasked( %va ; ; ZVFHMIN-LABEL: vfabs_vv_nxv16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v16, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fabs.nxv16f16( %va, splat (i1 true), i32 %evl) ret %v @@ -229,32 +209,10 @@ define @vfabs_vv_nxv32f16( %va, @llvm.vp.fabs.nxv32f16( %va, %m, i32 %evl) ret %v @@ -269,32 +227,10 @@ define @vfabs_vv_nxv32f16_unmasked( %va ; ; ZVFHMIN-LABEL: vfabs_vv_nxv32f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 -; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v16 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v16, a2 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v16, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: 
bltu a0, a1, .LBB11_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB11_2: -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v16, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fabs.nxv32f16( %va, splat (i1 true), i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll index 69ea7ce33cf6b6..af2668a9b0c545 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll @@ -19,12 +19,9 @@ define @vfneg_vv_nxv1f16( %va, @llvm.vp.fneg.nxv1f16( %va, %m, i32 %evl) ret %v @@ -39,12 +36,9 @@ define @vfneg_vv_nxv1f16_unmasked( %va, i ; ; ZVFHMIN-LABEL: vfneg_vv_nxv1f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v9 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fneg.nxv1f16( %va, splat (i1 true), i32 %evl) ret %v @@ -61,12 +55,9 @@ define @vfneg_vv_nxv2f16( %va, @llvm.vp.fneg.nxv2f16( %va, %m, i32 %evl) ret %v @@ -81,12 +72,9 @@ define @vfneg_vv_nxv2f16_unmasked( %va, i ; ; ZVFHMIN-LABEL: vfneg_vv_nxv2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v9 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fneg.nxv2f16( %va, splat (i1 true), i32 %evl) ret %v @@ -103,12 +91,9 @@ define @vfneg_vv_nxv4f16( %va, @llvm.vp.fneg.nxv4f16( %va, %m, i32 %evl) ret %v @@ -123,12 +108,9 @@ define @vfneg_vv_nxv4f16_unmasked( %va, i ; ; ZVFHMIN-LABEL: vfneg_vv_nxv4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v10, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fneg.nxv4f16( %va, splat (i1 true), i32 %evl) ret %v @@ -145,12 +127,9 @@ define @vfneg_vv_nxv8f16( %va, @llvm.vp.fneg.nxv8f16( %va, %m, i32 %evl) ret %v @@ -165,12 +144,9 @@ define @vfneg_vv_nxv8f16_unmasked( %va, i ; ; ZVFHMIN-LABEL: vfneg_vv_nxv8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfneg.v v12, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fneg.nxv8f16( %va, splat (i1 true), 
i32 %evl) ret %v @@ -187,12 +163,9 @@ define @vfneg_vv_nxv16f16( %va, @llvm.vp.fneg.nxv16f16( %va, %m, i32 %evl) ret %v @@ -207,12 +180,9 @@ define @vfneg_vv_nxv16f16_unmasked( %va ; ; ZVFHMIN-LABEL: vfneg_vv_nxv16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v16, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fneg.nxv16f16( %va, splat (i1 true), i32 %evl) ret %v @@ -229,32 +199,9 @@ define @vfneg_vv_nxv32f16( %va, @llvm.vp.fneg.nxv32f16( %va, %m, i32 %evl) ret %v @@ -269,32 +216,9 @@ define @vfneg_vv_nxv32f16_unmasked( %va ; ; ZVFHMIN-LABEL: vfneg_vv_nxv32f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 -; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v16 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v16, a2 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v16, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB11_2: -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v16, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.fneg.nxv32f16( %va, splat (i1 true), i32 %evl) ret %v From 5b3ba438dfa7815bb0f3be07a300866085a431b9 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 30 Aug 2024 09:43:02 -0700 Subject: [PATCH 67/98] Restructure createSimpleTargetReduction to match VP path [NFC] Reduces code significantly, but more importantly makes it obvious that this variant matches the VP variant just below. 
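For reference, the consolidation below works because llvm::getReductionIntrinsicID maps each simple RecurKind to the matching llvm.vector.reduce.* intrinsic, so a single CreateUnaryIntrinsic call can stand in for the per-kind builder calls. A minimal sketch of that mapping (illustrative only, not the exact LLVM source; the FAdd/FMul kinds stay on their own paths because their reductions take a start value):

    // Sketch: one table from recurrence kind to the unary
    // llvm.vector.reduce.* intrinsic it lowers to.
    Intrinsic::ID getReductionIntrinsicID(RecurKind RK) {
      switch (RK) {
      case RecurKind::Add:      return Intrinsic::vector_reduce_add;
      case RecurKind::Mul:      return Intrinsic::vector_reduce_mul;
      case RecurKind::And:      return Intrinsic::vector_reduce_and;
      case RecurKind::Or:       return Intrinsic::vector_reduce_or;
      case RecurKind::Xor:      return Intrinsic::vector_reduce_xor;
      case RecurKind::SMax:     return Intrinsic::vector_reduce_smax;
      case RecurKind::SMin:     return Intrinsic::vector_reduce_smin;
      case RecurKind::UMax:     return Intrinsic::vector_reduce_umax;
      case RecurKind::UMin:     return Intrinsic::vector_reduce_umin;
      case RecurKind::FMax:     return Intrinsic::vector_reduce_fmax;
      case RecurKind::FMin:     return Intrinsic::vector_reduce_fmin;
      case RecurKind::FMinimum: return Intrinsic::vector_reduce_fminimum;
      case RecurKind::FMaximum: return Intrinsic::vector_reduce_fmaximum;
      default:
        llvm_unreachable("kind has no unary reduction intrinsic");
      }
    }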
--- llvm/lib/Transforms/Utils/LoopUtils.cpp | 26 +++++++------------------ 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index f1f2d522f1cbaa..a49d3b0b990bc7 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1212,37 +1212,25 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src, auto *SrcVecEltTy = cast<VectorType>(Src->getType())->getElementType(); switch (RdxKind) { case RecurKind::Add: - return Builder.CreateAddReduce(Src); case RecurKind::Mul: - return Builder.CreateMulReduce(Src); case RecurKind::And: - return Builder.CreateAndReduce(Src); case RecurKind::Or: - return Builder.CreateOrReduce(Src); case RecurKind::Xor: - return Builder.CreateXorReduce(Src); - case RecurKind::FMulAdd: - case RecurKind::FAdd: - return Builder.CreateFAddReduce(ConstantFP::getNegativeZero(SrcVecEltTy), - Src); - case RecurKind::FMul: - return Builder.CreateFMulReduce(ConstantFP::get(SrcVecEltTy, 1.0), Src); case RecurKind::SMax: - return Builder.CreateIntMaxReduce(Src, true); case RecurKind::SMin: - return Builder.CreateIntMinReduce(Src, true); case RecurKind::UMax: - return Builder.CreateIntMaxReduce(Src, false); case RecurKind::UMin: - return Builder.CreateIntMinReduce(Src, false); case RecurKind::FMax: - return Builder.CreateFPMaxReduce(Src); case RecurKind::FMin: - return Builder.CreateFPMinReduce(Src); case RecurKind::FMinimum: - return Builder.CreateFPMinimumReduce(Src); case RecurKind::FMaximum: - return Builder.CreateFPMaximumReduce(Src); + return Builder.CreateUnaryIntrinsic(getReductionIntrinsicID(RdxKind), Src); + case RecurKind::FMulAdd: + case RecurKind::FAdd: + return Builder.CreateFAddReduce(ConstantFP::getNegativeZero(SrcVecEltTy), - Src); + case RecurKind::FMul: + return Builder.CreateFMulReduce(ConstantFP::get(SrcVecEltTy, 1.0), Src); default: llvm_unreachable("Unhandled opcode"); } From 9aa25b8c15c99d8e717121837a2559801e311e2d Mon Sep 17 00:00:00 2001 From: Walter Erquinigo Date: Fri, 30 Aug 2024 12:51:56 -0400 Subject: [PATCH 68/98] [LLDB][DWARF] Add an option to silence unsupported DW_FORM warnings (#106609) My build of LLDB constantly loads targets with a version of libc++ that was built with gcc and uses DW_FORM 0x1e, a form LLVM does not implement and, I suspect, never will. The form is a 128-bit encoding of numbers, which is a very unusual case. Because of this, LLDB keeps showing these warnings to my users, so I'm adding a flag that controls whether the warning is emitted.
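With the flag in place the warning can be turned off per user. Assuming the property registers under the plugin's usual settings prefix (the property name comes from the .td change below; the plugin.symbol-file.dwarf prefix is an assumption based on the plugin's existing settings), usage would look like:

    # Assumed settings path for the new property; this line can go in
    # ~/.lldbinit to silence the unsupported-DW_FORM warning for every session.
    settings set plugin.symbol-file.dwarf.emit-unsupported-dwform-value false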
--- .../SymbolFile/DWARF/SymbolFileDWARF.cpp | 30 ++++++++++++------- .../DWARF/SymbolFileDWARFProperties.td | 4 +++ 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index ff44329d081caa..2af6dc880842a4 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -87,7 +87,7 @@ #include #include -//#define ENABLE_DEBUG_PRINTF // COMMENT OUT THIS LINE PRIOR TO CHECKIN +// #define ENABLE_DEBUG_PRINTF // COMMENT OUT THIS LINE PRIOR TO CHECKIN #ifdef ENABLE_DEBUG_PRINTF #include @@ -129,6 +129,11 @@ class PluginProperties : public Properties { bool IgnoreFileIndexes() const { return GetPropertyAtIndexAs(ePropertyIgnoreIndexes, false); } + + bool EmitUnsupportedDWFormValueWarning() const { + return GetPropertyAtIndexAs( + ePropertyEmitUnsupportedDWFormValueWarning, true); + } }; } // namespace @@ -624,12 +629,14 @@ uint32_t SymbolFileDWARF::CalculateAbilities() { llvm::DWARFDebugAbbrev *abbrev = DebugAbbrev(); std::set unsupported_forms = GetUnsupportedForms(abbrev); if (!unsupported_forms.empty()) { - StreamString error; - error.Printf("unsupported DW_FORM value%s:", - unsupported_forms.size() > 1 ? "s" : ""); - for (auto form : unsupported_forms) - error.Printf(" %#x", form); - m_objfile_sp->GetModule()->ReportWarning("{0}", error.GetString()); + if (GetGlobalPluginProperties().EmitUnsupportedDWFormValueWarning()) { + StreamString error; + error.Printf("unsupported DW_FORM value%s:", + unsupported_forms.size() > 1 ? "s" : ""); + for (auto form : unsupported_forms) + error.Printf(" %#x", form); + m_objfile_sp->GetModule()->ReportWarning("{0}", error.GetString()); + } return 0; } @@ -1770,16 +1777,17 @@ SymbolFileDWARF *SymbolFileDWARF::GetDIERefSymbolFile(const DIERef &die_ref) { return this; if (file_index) { - // We have a SymbolFileDWARFDebugMap, so let it find the right file + // We have a SymbolFileDWARFDebugMap, so let it find the right file if (SymbolFileDWARFDebugMap *debug_map = GetDebugMapSymfile()) return debug_map->GetSymbolFileByOSOIndex(*file_index); - + // Handle the .dwp file case correctly if (*file_index == DIERef::k_file_index_mask) return GetDwpSymbolFile().get(); // DWP case // Handle the .dwo file case correctly - return DebugInfo().GetUnitAtIndex(*die_ref.file_index()) + return DebugInfo() + .GetUnitAtIndex(*die_ref.file_index()) ->GetDwoSymbolFile(); // DWO case } return this; @@ -3621,7 +3629,7 @@ VariableSP SymbolFileDWARF::ParseVariableDIE(const SymbolContext &sc, lldb::addr_t location_DW_OP_addr = LLDB_INVALID_ADDRESS; if (!location_is_const_value_data) { bool op_error = false; - const DWARFExpression* location = location_list.GetAlwaysValidExpr(); + const DWARFExpression *location = location_list.GetAlwaysValidExpr(); if (location) location_DW_OP_addr = location->GetLocation_DW_OP_addr(location_form.GetUnit(), op_error); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFProperties.td b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFProperties.td index 2f1ce88808b763..0f980a514b6720 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFProperties.td +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFProperties.td @@ -5,4 +5,8 @@ let Definition = "symbolfiledwarf" in { Global, DefaultFalse, Desc<"Ignore indexes present in the object files and always index DWARF manually.">; + def EmitUnsupportedDWFormValueWarning: 
Property<"emit-unsupported-dwform-value", "Boolean">, + Global, + DefaultTrue, + Desc<"Emit warnings about unsupported DW_Form values.">; } From 26f6091dc9c24bdf22390f2b9f68aacc4669ef36 Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Fri, 30 Aug 2024 09:52:26 -0700 Subject: [PATCH 69/98] [DirectX] Replace ResourceFlag enum with struct fields (#106617) Remove the ResourceFlag enum and add a ResourceFlags struct that stores the resource flags as bool fields. This produces a more readable YAML dump. For #103275 --- llvm/include/llvm/BinaryFormat/DXContainer.h | 20 ++++++++++++------- .../BinaryFormat/DXContainerConstants.def | 3 +-- .../include/llvm/ObjectYAML/DXContainerYAML.h | 6 +++++- llvm/lib/BinaryFormat/DXContainer.cpp | 10 ---------- llvm/lib/ObjectYAML/DXContainerYAML.cpp | 12 +++++------ .../DXContainer/DomainMaskVectors.yaml | 3 ++- .../DXContainer/PSVv2-amplification.yaml | 12 +++++++---- .../ObjectYAML/DXContainer/PSVv2-compute.yaml | 12 +++++++---- .../ObjectYAML/DXContainer/PSVv2-domain.yaml | 12 +++++++---- .../DXContainer/PSVv2-geometry.yaml | 12 +++++++---- .../ObjectYAML/DXContainer/PSVv2-hull.yaml | 12 +++++++---- .../ObjectYAML/DXContainer/PSVv2-mesh.yaml | 12 +++++++---- .../ObjectYAML/DXContainer/PSVv2-pixel.yaml | 12 +++++++---- .../ObjectYAML/DXContainer/PSVv2-vertex.yaml | 12 +++++++---- .../DXContainer/PSVv3-amplification.yaml | 12 +++++++---- .../ObjectYAML/DXContainer/PSVv3-compute.yaml | 12 +++++++---- .../ObjectYAML/DXContainer/PSVv3-domain.yaml | 12 +++++++---- .../DXContainer/PSVv3-geometry.yaml | 12 +++++++---- .../ObjectYAML/DXContainer/PSVv3-hull.yaml | 12 +++++++---- .../ObjectYAML/DXContainer/PSVv3-mesh.yaml | 12 +++++++---- .../ObjectYAML/DXContainer/PSVv3-pixel.yaml | 12 +++++++---- .../ObjectYAML/DXContainer/PSVv3-vertex.yaml | 12 +++++++---- 22 files changed, 155 insertions(+), 91 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index a4cc814549c95b..21e28d546286ee 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -313,13 +313,19 @@ enum class ResourceKind : uint32_t { ArrayRef<EnumEntry<ResourceKind>> getResourceKinds(); -#define RESOURCE_FLAG(Val, Enum) Enum = Val, -enum class ResourceFlag : uint32_t { -#include "DXContainerConstants.def" +#define RESOURCE_FLAG(Index, Enum) bool Enum = false; +struct ResourceFlags { + ResourceFlags() {}; + struct FlagsBits { +#include "llvm/BinaryFormat/DXContainerConstants.def" + }; + union { + uint32_t Flags; + FlagsBits Bits; + }; + bool operator==(const uint32_t RFlags) const { return Flags == RFlags; } }; -ArrayRef<EnumEntry<ResourceFlag>> getResourceFlags(); - namespace v0 { struct RuntimeInfo { PipelinePSVInfo StageInfo; @@ -439,12 +445,12 @@ struct RuntimeInfo : public v1::RuntimeInfo { struct ResourceBindInfo : public v0::ResourceBindInfo { ResourceKind Kind; - uint32_t Flags; + ResourceFlags Flags; void swapBytes() { v0::ResourceBindInfo::swapBytes(); sys::swapByteOrder(Kind); - sys::swapByteOrder(Flags); + sys::swapByteOrder(Flags.Flags); } }; diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index 4111cecb018bb3..1aacbb2f65b27f 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -190,8 +190,7 @@ RESOURCE_KIND(18, FeedbackTexture2DArray) #endif // RESOURCE_KIND #ifdef RESOURCE_FLAG -RESOURCE_FLAG(0, None) -RESOURCE_FLAG(1, UsedByAtomic64) +RESOURCE_FLAG(0, UsedByAtomic64)
#undef RESOURCE_FLAG #endif // RESOURCE_FLAG diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index e432359b7bbd07..66ad057ab0e30f 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -72,6 +72,7 @@ struct ShaderHash { std::vector Digest; }; +using ResourceFlags = dxbc::PSV::ResourceFlags; using ResourceBindInfo = dxbc::PSV::v2::ResourceBindInfo; struct SignatureElement { @@ -178,7 +179,6 @@ LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ComponentType) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::InterpolationMode) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ResourceType) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ResourceKind) -LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ResourceFlag) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::D3DSystemValue) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::SigComponentType) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::SigMinPrecision) @@ -221,6 +221,10 @@ template <> struct MappingTraits { static void mapping(IO &IO, DXContainerYAML::Object &Obj); }; +template <> struct MappingTraits { + static void mapping(IO &IO, DXContainerYAML::ResourceFlags &Flags); +}; + template <> struct MappingTraits { static void mapping(IO &IO, DXContainerYAML::ResourceBindInfo &Res); }; diff --git a/llvm/lib/BinaryFormat/DXContainer.cpp b/llvm/lib/BinaryFormat/DXContainer.cpp index 790947cc729c0b..97ceb16ccf53f4 100644 --- a/llvm/lib/BinaryFormat/DXContainer.cpp +++ b/llvm/lib/BinaryFormat/DXContainer.cpp @@ -109,13 +109,3 @@ static const EnumEntry ResourceKindNames[] = { ArrayRef> PSV::getResourceKinds() { return ArrayRef(ResourceKindNames); } - -#define RESOURCE_FLAG(Val, Enum) {#Enum, PSV::ResourceFlag::Enum}, - -static const EnumEntry ResourceFlagNames[] = { -#include "llvm/BinaryFormat/DXContainerConstants.def" -}; - -ArrayRef> PSV::getResourceFlags() { - return ArrayRef(ResourceFlagNames); -} diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 21a966d5abd132..5dee1221b27c01 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -206,6 +206,12 @@ void MappingTraits::mapping( IO.mapRequired("Parts", Obj.Parts); } +void MappingTraits::mapping( + IO &IO, DXContainerYAML::ResourceFlags &Flags) { +#define RESOURCE_FLAG(FlagIndex, Enum) IO.mapRequired(#Enum, Flags.Bits.Enum); +#include "llvm/BinaryFormat/DXContainerConstants.def" +} + void MappingTraits::mapping( IO &IO, DXContainerYAML::ResourceBindInfo &Res) { IO.mapRequired("Type", Res.Type); @@ -266,12 +272,6 @@ void ScalarEnumerationTraits::enumeration( IO.enumCase(Value, E.Name.str().c_str(), E.Value); } -void ScalarEnumerationTraits::enumeration( - IO &IO, dxbc::PSV::ResourceFlag &Value) { - for (const auto &E : dxbc::PSV::getResourceFlags()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); -} - void ScalarEnumerationTraits::enumeration( IO &IO, dxbc::D3DSystemValue &Value) { for (const auto &E : dxbc::getD3DSystemValues()) diff --git a/llvm/test/ObjectYAML/DXContainer/DomainMaskVectors.yaml b/llvm/test/ObjectYAML/DXContainer/DomainMaskVectors.yaml index f3cfa90d1cf901..1a2f341f03ef71 100644 --- a/llvm/test/ObjectYAML/DXContainer/DomainMaskVectors.yaml +++ b/llvm/test/ObjectYAML/DXContainer/DomainMaskVectors.yaml @@ -75,7 +75,8 @@ Parts: LowerBound: 0 UpperBound: 0 Kind: CBuffer - Flags: 0 + Flags: + UsedByAtomic64: true SigInputElements: - Name: AAA_HSFoo Indices: [ 0 ] diff --git 
a/llvm/test/ObjectYAML/DXContainer/PSVv2-amplification.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-amplification.yaml index 8bae742b573919..1e00e604f3e248 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-amplification.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-amplification.yaml @@ -30,13 +30,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -77,13 +79,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-compute.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-compute.yaml index 74eb2b86ad01b2..c8bfd9acf68efc 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-compute.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-compute.yaml @@ -29,13 +29,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -75,13 +77,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-domain.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-domain.yaml index 38f81bd93d67cf..021fb1b5fffb1f 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-domain.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-domain.yaml @@ -33,13 +33,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -84,13 +86,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-geometry.yaml 
b/llvm/test/ObjectYAML/DXContainer/PSVv2-geometry.yaml index 99fdbbb7c9edaf..74e32efbe2c659 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-geometry.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-geometry.yaml @@ -34,13 +34,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -85,13 +87,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-hull.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-hull.yaml index de8af95dbcbd89..79d92e2f0c5e6f 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-hull.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-hull.yaml @@ -34,13 +34,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -86,13 +88,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-mesh.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-mesh.yaml index 78fc077348f42a..27bf148126005b 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-mesh.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-mesh.yaml @@ -36,13 +36,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -89,13 +91,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-pixel.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-pixel.yaml index ebe1e51faff3f8..1a1a74d7f3121d 100644 --- 
a/llvm/test/ObjectYAML/DXContainer/PSVv2-pixel.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-pixel.yaml @@ -31,13 +31,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -79,13 +81,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-vertex.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-vertex.yaml index 2bca2f211136b2..6b0ba5eb3d19f0 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-vertex.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-vertex.yaml @@ -30,13 +30,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -77,13 +79,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-amplification.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-amplification.yaml index 9e31d40ec7c1b4..6f7d151b266c9d 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-amplification.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-amplification.yaml @@ -31,13 +31,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -79,13 +81,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-compute.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-compute.yaml index 530a8597cb6498..2de3d435af1de9 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-compute.yaml +++ 
b/llvm/test/ObjectYAML/DXContainer/PSVv3-compute.yaml @@ -30,13 +30,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -77,13 +79,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-domain.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-domain.yaml index a71ab67633eb6f..91afb2f11fc7c4 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-domain.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-domain.yaml @@ -34,13 +34,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -86,13 +88,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-geometry.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-geometry.yaml index db530253c6a745..f661e81fe869b9 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-geometry.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-geometry.yaml @@ -35,13 +35,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -87,13 +89,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-hull.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-hull.yaml index 3e3ba493e98450..4140c3180e32ca 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-hull.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-hull.yaml @@ -35,13 +35,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - 
Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -88,13 +90,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-mesh.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-mesh.yaml index 57bbcecfa1796b..03ce5b583315d0 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-mesh.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-mesh.yaml @@ -37,13 +37,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -91,13 +93,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-pixel.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-pixel.yaml index c94c234142a34b..2434567b2a6f5c 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-pixel.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-pixel.yaml @@ -32,13 +32,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -81,13 +83,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-vertex.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-vertex.yaml index 697fa870c2257c..b43f6aa6b71d4a 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-vertex.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-vertex.yaml @@ -31,13 +31,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + 
Flags: + UsedByAtomic64: true SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -79,13 +81,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: true # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] From 4b553f4916180ac46c250b2625c5ee6f64b26533 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 30 Aug 2024 10:08:51 -0700 Subject: [PATCH 70/98] Regen a bunch of vectorizer tests to avoid naming churn in upcoming review --- .../AArch64/sve-interleaved-accesses.ll | 4 +- .../PowerPC/widened-massv-vfabi-attr.ll | 16 +- .../X86/imprecise-through-phis.ll | 44 ++--- .../LoopVectorize/reduction-predselect.ll | 2 +- .../SLPVectorizer/X86/dot-product.ll | 176 +++++++++--------- .../X86/redux-feed-buildvector.ll | 20 +- .../X86/redux-feed-insertelement.ll | 8 +- .../SLPVectorizer/X86/slp-fma-loss.ll | 8 +- 8 files changed, 139 insertions(+), 139 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index 9641dd7d21fd2a..852a967e764819 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -1521,10 +1521,10 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 { ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP33:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV2:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[I1:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[SCALAR_RECUR]] to i32 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP33]] to i32 ; CHECK-NEXT: [[I1]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[IV1:%.*]] = or disjoint i64 [[IV]], 1 ; CHECK-NEXT: [[IV2]] = add nuw nsw i64 [[IV]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll index aafe849b7042ab..e3af831f83c970 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll @@ -7,19 +7,19 @@ define dso_local double @test(ptr %Arr) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = 
getelementptr inbounds float, ptr [[ARR:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double> -; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x double> @__sind2_P8(<2 x double> [[TMP3]]) -; CHECK-NEXT: [[TMP5]] = fadd fast <2 x double> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double> +; CHECK-NEXT: [[TMP3:%.*]] = call fast <2 x double> @__sind2_P8(<2 x double> [[TMP2]]) +; CHECK-NEXT: [[TMP4]] = fadd fast <2 x double> [[VEC_PHI]], [[TMP3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP5]]) -; CHECK-NEXT: ret double [[TMP7]] +; CHECK-NEXT: [[TMP6:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP4]]) +; CHECK-NEXT: ret double [[TMP6]] ; entry: br label %for.cond diff --git a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll index 7752af558f7d61..7cf4070f76d76e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll @@ -84,20 +84,20 @@ define double @sumIfVector(ptr nocapture readonly %arr) { ; SSE-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x double>, ptr [[TMP5]], align 8 ; SSE-NEXT: [[TMP6:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD]], ; SSE-NEXT: [[TMP7:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD2]], -; SSE-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]] -; SSE-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[VEC_PHI1]], [[WIDE_LOAD2]] -; SSE-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP6]], <2 x double> [[TMP10]], <2 x double> [[VEC_PHI]] -; SSE-NEXT: [[PREDPHI3]] = select <2 x i1> [[TMP7]], <2 x double> [[TMP11]], <2 x double> [[VEC_PHI1]] +; SSE-NEXT: [[TMP8:%.*]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]] +; SSE-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[VEC_PHI1]], [[WIDE_LOAD2]] +; SSE-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP6]], <2 x double> [[TMP8]], <2 x double> [[VEC_PHI]] +; SSE-NEXT: [[PREDPHI3]] = select <2 x i1> [[TMP7]], <2 x double> [[TMP9]], <2 x double> [[VEC_PHI1]] ; SSE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; SSE-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 -; SSE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SSE-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 +; SSE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SSE: middle.block: ; SSE-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[PREDPHI3]], [[PREDPHI]] -; SSE-NEXT: [[TMP13:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[BIN_RDX]]) +; SSE-NEXT: [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[BIN_RDX]]) ; SSE-NEXT: br i1 true, label [[DONE:%.*]], label [[SCALAR_PH]] ; SSE: scalar.ph: ; 
SSE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SSE-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; SSE-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] ; SSE-NEXT: br label [[LOOP:%.*]] ; SSE: loop: ; SSE-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ] @@ -117,7 +117,7 @@ define double @sumIfVector(ptr nocapture readonly %arr) { ; SSE-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32 ; SSE-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP3:![0-9]+]] ; SSE: done: -; SSE-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; SSE-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; SSE-NEXT: ret double [[TOT_NEXT_LCSSA]] ; ; AVX-LABEL: @sumIfVector( @@ -151,26 +151,26 @@ define double @sumIfVector(ptr nocapture readonly %arr) { ; AVX-NEXT: [[TMP13:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD4]], ; AVX-NEXT: [[TMP14:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD5]], ; AVX-NEXT: [[TMP15:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD6]], -; AVX-NEXT: [[TMP20:%.*]] = fadd fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]] -; AVX-NEXT: [[TMP21:%.*]] = fadd fast <4 x double> [[VEC_PHI1]], [[WIDE_LOAD4]] -; AVX-NEXT: [[TMP22:%.*]] = fadd fast <4 x double> [[VEC_PHI2]], [[WIDE_LOAD5]] -; AVX-NEXT: [[TMP23:%.*]] = fadd fast <4 x double> [[VEC_PHI3]], [[WIDE_LOAD6]] -; AVX-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP12]], <4 x double> [[TMP20]], <4 x double> [[VEC_PHI]] -; AVX-NEXT: [[PREDPHI7]] = select <4 x i1> [[TMP13]], <4 x double> [[TMP21]], <4 x double> [[VEC_PHI1]] -; AVX-NEXT: [[PREDPHI8]] = select <4 x i1> [[TMP14]], <4 x double> [[TMP22]], <4 x double> [[VEC_PHI2]] -; AVX-NEXT: [[PREDPHI9]] = select <4 x i1> [[TMP15]], <4 x double> [[TMP23]], <4 x double> [[VEC_PHI3]] +; AVX-NEXT: [[TMP16:%.*]] = fadd fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]] +; AVX-NEXT: [[TMP17:%.*]] = fadd fast <4 x double> [[VEC_PHI1]], [[WIDE_LOAD4]] +; AVX-NEXT: [[TMP18:%.*]] = fadd fast <4 x double> [[VEC_PHI2]], [[WIDE_LOAD5]] +; AVX-NEXT: [[TMP19:%.*]] = fadd fast <4 x double> [[VEC_PHI3]], [[WIDE_LOAD6]] +; AVX-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP12]], <4 x double> [[TMP16]], <4 x double> [[VEC_PHI]] +; AVX-NEXT: [[PREDPHI7]] = select <4 x i1> [[TMP13]], <4 x double> [[TMP17]], <4 x double> [[VEC_PHI1]] +; AVX-NEXT: [[PREDPHI8]] = select <4 x i1> [[TMP14]], <4 x double> [[TMP18]], <4 x double> [[VEC_PHI2]] +; AVX-NEXT: [[PREDPHI9]] = select <4 x i1> [[TMP15]], <4 x double> [[TMP19]], <4 x double> [[VEC_PHI3]] ; AVX-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 -; AVX-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 -; AVX-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; AVX-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 +; AVX-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX: middle.block: ; AVX-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x double> [[PREDPHI7]], [[PREDPHI]] ; AVX-NEXT: [[BIN_RDX10:%.*]] = fadd fast <4 x double> [[PREDPHI8]], [[BIN_RDX]] ; AVX-NEXT: [[BIN_RDX11:%.*]] = fadd fast <4 x double> [[PREDPHI9]], [[BIN_RDX10]] -; AVX-NEXT: [[TMP25:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX11]]) 
+; AVX-NEXT: [[TMP21:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX11]]) ; AVX-NEXT: br i1 true, label [[DONE:%.*]], label [[SCALAR_PH]] ; AVX: scalar.ph: ; AVX-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; AVX-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP25]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; AVX-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] ; AVX-NEXT: br label [[LOOP:%.*]] ; AVX: loop: ; AVX-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ] @@ -190,7 +190,7 @@ define double @sumIfVector(ptr nocapture readonly %arr) { ; AVX-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32 ; AVX-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP3:![0-9]+]] ; AVX: done: -; AVX-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; AVX-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] ; AVX-NEXT: ret double [[TOT_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll index 7fd762c7b735a0..40383c7e551bcf 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll @@ -65,7 +65,7 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) { ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: -; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: ._crit_edge: ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ poison, [[DOTLR_PH]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll index af27572cfeaef8..4352b3d0c80d32 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll @@ -12,19 +12,19 @@ define double @dot4f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %p ; CHECK-LABEL: @dot4f64( ; CHECK-NEXT: [[PTRX2:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 2 ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRX]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[PTRY]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[PTRX2]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr [[PTRY2]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP7]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 -; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP10]], i32 0 -; CHECK-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP10]], i32 1 -; CHECK-NEXT: [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP14]] +; CHECK-NEXT: [[TMP1:%.*]] = load 
<2 x double>, ptr [[PTRX]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[PTRX2]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[PTRY2]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 +; CHECK-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 +; CHECK-NEXT: [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP10]] ; CHECK-NEXT: ret double [[DOT0123]] ; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 @@ -55,19 +55,19 @@ define float @dot4f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt ; CHECK-LABEL: @dot4f32( ; CHECK-NEXT: [[PTRX2:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 2 ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRX]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[PTRY]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr [[PTRX2]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr [[PTRY2]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP7]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 -; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0 -; CHECK-NEXT: [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP10]], i32 1 -; CHECK-NEXT: [[DOT0123:%.*]] = fadd float [[DOT012]], [[TMP14]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[PTRX2]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, ptr [[PTRY2]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x float> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 +; CHECK-NEXT: [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 +; CHECK-NEXT: [[DOT0123:%.*]] = fadd float [[DOT012]], [[TMP10]] ; CHECK-NEXT: ret float [[DOT0123]] ; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 @@ -96,11 +96,11 @@ define float @dot4f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt define double @dot4f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) { ; CHECK-LABEL: @dot4f64_fast( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 -; CHECK-NEXT: 
[[TMP5:%.*]] = fmul <4 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP5]]) -; CHECK-NEXT: ret double [[TMP6]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP3]]) +; CHECK-NEXT: ret double [[TMP4]] ; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1 @@ -128,11 +128,11 @@ define double @dot4f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(3 define float @dot4f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) { ; CHECK-LABEL: @dot4f32_fast( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]]) -; CHECK-NEXT: ret float [[TMP6]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: ret float [[TMP4]] ; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1 @@ -169,13 +169,13 @@ define double @dot3f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %p ; CHECK-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4 ; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 -; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[MUL0]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 -; CHECK-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[MUL0]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; CHECK-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP5]] ; CHECK-NEXT: ret double [[DOT012]] ; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 @@ -203,13 +203,13 @@ define float @dot3f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt ; CHECK-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4 ; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4 -; 
CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 -; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[MUL0]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 -; CHECK-NEXT: [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[MUL0]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP5]] ; CHECK-NEXT: ret float [[DOT012]] ; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 @@ -237,13 +237,13 @@ define double @dot3f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(3 ; CHECK-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4 ; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 -; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 -; CHECK-NEXT: [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; CHECK-NEXT: [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP5]] ; CHECK-NEXT: ret double [[DOT012]] ; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 @@ -271,13 +271,13 @@ define float @dot3f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16 ; CHECK-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4 ; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 -; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 -; CHECK-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP5]] ; CHECK-NEXT: ret float 
[[DOT012]] ; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 @@ -304,12 +304,12 @@ define float @dot3f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16 define double @dot2f64(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) { ; CHECK-LABEL: @dot2f64( -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 -; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[TMP4]], [[TMP5]] ; CHECK-NEXT: ret double [[DOT01]] ; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 @@ -326,12 +326,12 @@ define double @dot2f64(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %p define float @dot2f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) { ; CHECK-LABEL: @dot2f32( -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 -; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[TMP4]], [[TMP5]] ; CHECK-NEXT: ret float [[DOT01]] ; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 @@ -348,12 +348,12 @@ define float @dot2f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt define double @dot2f64_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) { ; CHECK-LABEL: @dot2f64_fast( -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 -; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[TMP4]], [[TMP5]] ; CHECK-NEXT: ret double [[DOT01]] ; %ptrx1 = getelementptr inbounds double, ptr 
%ptrx, i64 1 @@ -370,12 +370,12 @@ define double @dot2f64_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(1 define float @dot2f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) { ; CHECK-LABEL: @dot2f32_fast( -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 -; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[TMP4]], [[TMP5]] ; CHECK-NEXT: ret float [[DOT01]] ; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll index 83457cc4966f7c..ebd35448ba72f7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll @@ -11,18 +11,18 @@ define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARG:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, <8 x ptr> [[SHUFFLE]], <8 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <8 x ptr> [[TMP1]], <8 x i64> ; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16 -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP1]], i32 8, <8 x i1> , <8 x double> poison) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> , <8 x double> poison) ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = load <8 x double>, ptr [[ARG1]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <8 x double> [[TMP7]], [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP8]]) -; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP5]]) -; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i64 0 -; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP10]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x double>, ptr [[ARG1]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <8 x double> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP7]]) +; CHECK-NEXT: 
[[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP5]]) +; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0 +; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP9]], i64 1 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> ; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> ) ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll index 19d0bc9b330657..20c5bda328c100 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll @@ -6,10 +6,10 @@ declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg define void @rdx_feeds_single_insert(<2 x double> %v, ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) { ; CHECK-LABEL: @rdx_feeds_single_insert( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, ptr [[ARG1:%.*]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <8 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP2]]) -; CHECK-NEXT: [[I:%.*]] = insertelement <2 x double> [[V:%.*]], double [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr [[ARG1:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <8 x double> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP1]]) +; CHECK-NEXT: [[I:%.*]] = insertelement <2 x double> [[V:%.*]], double [[TMP2]], i64 1 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> ; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I]], <2 x ptr> [[P]], i32 8, <2 x i1> ) ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll index 02c7e4a03325ed..1f3c0fb9e297c5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll @@ -54,10 +54,10 @@ define double @hr_or_mul() { ; CHECK-LABEL: @hr_or_mul( ; CHECK-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> , [[SHUFFLE]] -; CHECK-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP2]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP3]], [[CVT0]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x double> , [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP3]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP4]], [[CVT0]] ; CHECK-NEXT: ret double [[OP_RDX]] ; %cvt0 = uitofp i16 3 to double From 5af4ba2684b9b59de3bf8135f62e05ab68cfc489 Mon Sep 17 00:00:00 2001 From: Harini0924 Date: Fri, 30 Aug 2024 10:15:21 -0700 Subject: [PATCH 71/98] Revert "[llvm-lit] Add 
precommit test to verify current behavior of glob expansion in lit's internal shell" (#106763) Reverts llvm/llvm-project#106325 Broke some Buildbots. --- .../lit/tests/Inputs/shtest-glob/example_file1.input | 2 -- .../lit/tests/Inputs/shtest-glob/example_file2.input | 2 -- .../utils/lit/tests/Inputs/shtest-glob/glob-echo.txt | 2 -- .../lit/tests/Inputs/shtest-glob/glob-mkdir.txt | 2 -- llvm/utils/lit/tests/Inputs/shtest-glob/lit.cfg | 8 -------- llvm/utils/lit/tests/shtest-glob.py | 12 ------------ 6 files changed, 28 deletions(-) delete mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/example_file1.input delete mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/example_file2.input delete mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/glob-echo.txt delete mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt delete mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/lit.cfg delete mode 100644 llvm/utils/lit/tests/shtest-glob.py diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/example_file1.input b/llvm/utils/lit/tests/Inputs/shtest-glob/example_file1.input deleted file mode 100644 index 0987c9081ca1f3..00000000000000 --- a/llvm/utils/lit/tests/Inputs/shtest-glob/example_file1.input +++ /dev/null @@ -1,2 +0,0 @@ -## This is the first example file used for testing glob pattern matching. -This is the first example file. diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/example_file2.input b/llvm/utils/lit/tests/Inputs/shtest-glob/example_file2.input deleted file mode 100644 index f1a843f308262e..00000000000000 --- a/llvm/utils/lit/tests/Inputs/shtest-glob/example_file2.input +++ /dev/null @@ -1,2 +0,0 @@ -## This is the second example file used for testing glob pattern matching. -This is the second example file. diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/glob-echo.txt b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-echo.txt deleted file mode 100644 index b69f5e74fd7281..00000000000000 --- a/llvm/utils/lit/tests/Inputs/shtest-glob/glob-echo.txt +++ /dev/null @@ -1,2 +0,0 @@ -## Tests glob pattern expansion by listing matching files. -# RUN: echo %S/example_file*.input diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt deleted file mode 100644 index d1329f5dbfaaed..00000000000000 --- a/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt +++ /dev/null @@ -1,2 +0,0 @@ -## Tests glob pattern handling in the mkdir command. -# RUN: not mkdir %S/example_file*.input diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-glob/lit.cfg deleted file mode 100644 index 4e5f4cac4c4653..00000000000000 --- a/llvm/utils/lit/tests/Inputs/shtest-glob/lit.cfg +++ /dev/null @@ -1,8 +0,0 @@ -import lit.formats - -config.name = "shtest-glob" -config.suffixes = [".txt"] -config.test_format = lit.formats.ShTest() -config.test_source_root = None -config.test_exec_root = None -config.substitutions.append(("%{python}", '"%s"' % (sys.executable))) diff --git a/llvm/utils/lit/tests/shtest-glob.py b/llvm/utils/lit/tests/shtest-glob.py deleted file mode 100644 index 551331cb38e259..00000000000000 --- a/llvm/utils/lit/tests/shtest-glob.py +++ /dev/null @@ -1,12 +0,0 @@ -## Tests glob pattern handling in echo command. - -# RUN: not %{lit} -a -v %{inputs}/shtest-glob \ -# RUN: | FileCheck -dump-input=fail -match-full-lines %s -# -# END. 
- -# CHECK: UNRESOLVED: shtest-glob :: glob-echo.txt ({{[^)]*}}) -# CHECK: TypeError: string argument expected, got 'GlobItem' - -# CHECK: FAIL: shtest-glob :: glob-mkdir.txt ({{[^)]*}} -# CHECK: # error: command failed with exit status: 1 From 5500e21942f7047344b6fee62d3e08c0ba2f9182 Mon Sep 17 00:00:00 2001 From: Walter Erquinigo Date: Fri, 30 Aug 2024 13:18:23 -0400 Subject: [PATCH 72/98] Revert "[LLDB][DWARF] Add an option to silence unsupported DW_FORM warnings" (#106765) Reverts llvm/llvm-project#106609 --- .../SymbolFile/DWARF/SymbolFileDWARF.cpp | 30 +++++++------------ .../DWARF/SymbolFileDWARFProperties.td | 4 --- 2 files changed, 11 insertions(+), 23 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 2af6dc880842a4..ff44329d081caa 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -87,7 +87,7 @@ #include #include -// #define ENABLE_DEBUG_PRINTF // COMMENT OUT THIS LINE PRIOR TO CHECKIN +//#define ENABLE_DEBUG_PRINTF // COMMENT OUT THIS LINE PRIOR TO CHECKIN #ifdef ENABLE_DEBUG_PRINTF #include @@ -129,11 +129,6 @@ class PluginProperties : public Properties { bool IgnoreFileIndexes() const { return GetPropertyAtIndexAs(ePropertyIgnoreIndexes, false); } - - bool EmitUnsupportedDWFormValueWarning() const { - return GetPropertyAtIndexAs( - ePropertyEmitUnsupportedDWFormValueWarning, true); - } }; } // namespace @@ -629,14 +624,12 @@ uint32_t SymbolFileDWARF::CalculateAbilities() { llvm::DWARFDebugAbbrev *abbrev = DebugAbbrev(); std::set unsupported_forms = GetUnsupportedForms(abbrev); if (!unsupported_forms.empty()) { - if (GetGlobalPluginProperties().EmitUnsupportedDWFormValueWarning()) { - StreamString error; - error.Printf("unsupported DW_FORM value%s:", - unsupported_forms.size() > 1 ? "s" : ""); - for (auto form : unsupported_forms) - error.Printf(" %#x", form); - m_objfile_sp->GetModule()->ReportWarning("{0}", error.GetString()); - } + StreamString error; + error.Printf("unsupported DW_FORM value%s:", + unsupported_forms.size() > 1 ? 
"s" : ""); + for (auto form : unsupported_forms) + error.Printf(" %#x", form); + m_objfile_sp->GetModule()->ReportWarning("{0}", error.GetString()); return 0; } @@ -1777,17 +1770,16 @@ SymbolFileDWARF *SymbolFileDWARF::GetDIERefSymbolFile(const DIERef &die_ref) { return this; if (file_index) { - // We have a SymbolFileDWARFDebugMap, so let it find the right file + // We have a SymbolFileDWARFDebugMap, so let it find the right file if (SymbolFileDWARFDebugMap *debug_map = GetDebugMapSymfile()) return debug_map->GetSymbolFileByOSOIndex(*file_index); - + // Handle the .dwp file case correctly if (*file_index == DIERef::k_file_index_mask) return GetDwpSymbolFile().get(); // DWP case // Handle the .dwo file case correctly - return DebugInfo() - .GetUnitAtIndex(*die_ref.file_index()) + return DebugInfo().GetUnitAtIndex(*die_ref.file_index()) ->GetDwoSymbolFile(); // DWO case } return this; @@ -3629,7 +3621,7 @@ VariableSP SymbolFileDWARF::ParseVariableDIE(const SymbolContext &sc, lldb::addr_t location_DW_OP_addr = LLDB_INVALID_ADDRESS; if (!location_is_const_value_data) { bool op_error = false; - const DWARFExpression *location = location_list.GetAlwaysValidExpr(); + const DWARFExpression* location = location_list.GetAlwaysValidExpr(); if (location) location_DW_OP_addr = location->GetLocation_DW_OP_addr(location_form.GetUnit(), op_error); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFProperties.td b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFProperties.td index 0f980a514b6720..2f1ce88808b763 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFProperties.td +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFProperties.td @@ -5,8 +5,4 @@ let Definition = "symbolfiledwarf" in { Global, DefaultFalse, Desc<"Ignore indexes present in the object files and always index DWARF manually.">; - def EmitUnsupportedDWFormValueWarning: Property<"emit-unsupported-dwform-value", "Boolean">, - Global, - DefaultTrue, - Desc<"Emit warnings about unsupported DW_Form values.">; } From a4aa6bc8fc2130761b8db5db4748059127662785 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 30 Aug 2024 10:17:31 -0700 Subject: [PATCH 73/98] [SLP]Fix PR106667: carefully look for operand nodes. If the operand node has the same scalars as one of the vectorized nodes, the compiler could miss this and incorrectly request minbitwidth data for the wrong node. It may lead to a compiler crash, because the vectorized node might have different minbw result. Fixes https://github.com/llvm/llvm-project/issues/106667 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 333 +++++++++--------- .../X86/multi-nodes-bv-vectorized.ll | 44 +++ 2 files changed, 208 insertions(+), 169 deletions(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-bv-vectorized.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 4c0a1c4c094b95..e9785ef9ded2d5 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2864,6 +2864,12 @@ class BoUpSLP { /// avoid issues with def-use order. Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs); + TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx); + const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, + unsigned NodeIdx) const { + return const_cast(this)->getMatchedVectorizedOperand(E, NodeIdx); + } + /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry /// \p E. 
/// \param PostponedPHIs true, if need to postpone emission of phi nodes to @@ -6964,6 +6970,55 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } } + // Check if this is a duplicate of another entry. + if (TreeEntry *E = getTreeEntry(S.OpValue)) { + LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); + if (!E->isSame(VL)) { + auto It = MultiNodeScalars.find(S.OpValue); + if (It != MultiNodeScalars.end()) { + auto *TEIt = find_if(It->getSecond(), + [&](TreeEntry *ME) { return ME->isSame(VL); }); + if (TEIt != It->getSecond().end()) + E = *TEIt; + else + E = nullptr; + } else { + E = nullptr; + } + } + if (!E) { + if (!doesNotNeedToBeScheduled(S.OpValue)) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndices); + return; + } + SmallPtrSet Nodes; + Nodes.insert(getTreeEntry(S.OpValue)); + for (const TreeEntry *E : MultiNodeScalars.lookup(S.OpValue)) + Nodes.insert(E); + SmallPtrSet Values(VL.begin(), VL.end()); + if (any_of(Nodes, [&](const TreeEntry *E) { + return all_of(E->Scalars, + [&](Value *V) { return Values.contains(V); }); + })) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n"); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndices); + return; + } + } else { + // Record the reuse of the tree node. FIXME, currently this is only used + // to properly draw the graph rather than for the actual vectorization. + E->UserTreeIndices.push_back(UserTreeIdx); + LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue + << ".\n"); + return; + } + } + // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of // a load), in which case peek through to include it in the tree, without // ballooning over-budget. @@ -7095,55 +7150,6 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // We now know that this is a vector of instructions of the same type from // the same block. - // Check if this is a duplicate of another entry. - if (TreeEntry *E = getTreeEntry(S.OpValue)) { - LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); - if (!E->isSame(VL)) { - auto It = MultiNodeScalars.find(S.OpValue); - if (It != MultiNodeScalars.end()) { - auto *TEIt = find_if(It->getSecond(), - [&](TreeEntry *ME) { return ME->isSame(VL); }); - if (TEIt != It->getSecond().end()) - E = *TEIt; - else - E = nullptr; - } else { - E = nullptr; - } - } - if (!E) { - if (!doesNotNeedToBeScheduled(S.OpValue)) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - if (TryToFindDuplicates(S)) - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndices); - return; - } - SmallPtrSet Nodes; - Nodes.insert(getTreeEntry(S.OpValue)); - for (const TreeEntry *E : MultiNodeScalars.lookup(S.OpValue)) - Nodes.insert(E); - SmallPtrSet Values(VL.begin(), VL.end()); - if (any_of(Nodes, [&](const TreeEntry *E) { - return all_of(E->Scalars, - [&](Value *V) { return Values.contains(V); }); - })) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n"); - if (TryToFindDuplicates(S)) - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndices); - return; - } - } else { - // Record the reuse of the tree node. FIXME, currently this is only used - // to properly draw the graph rather than for the actual vectorization. 
- E->UserTreeIndices.push_back(UserTreeIdx); - LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue - << ".\n"); - return; - } - } - // Check that none of the instructions in the bundle are already in the tree. for (Value *V : VL) { if ((!IsScatterVectorizeUserTE && !isa(V)) || @@ -9362,22 +9368,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E, unsigned Idx) const { - Value *Op = E->getOperand(Idx).front(); - if (const TreeEntry *TE = getTreeEntry(Op)) { - if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) { - return EI.EdgeIdx == Idx && EI.UserTE == E; - }) != TE->UserTreeIndices.end()) - return TE; - auto MIt = MultiNodeScalars.find(Op); - if (MIt != MultiNodeScalars.end()) { - for (const TreeEntry *TE : MIt->second) { - if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) { - return EI.EdgeIdx == Idx && EI.UserTE == E; - }) != TE->UserTreeIndices.end()) - return TE; - } - } - } + if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx)) + return VE; const auto *It = find_if(VectorizableTree, [&](const std::unique_ptr &TE) { return TE->isGather() && @@ -12521,10 +12513,9 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { } }; -Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, - bool PostponedPHIs) { - ValueList &VL = E->getOperand(NodeIdx); - const unsigned VF = VL.size(); +BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E, + unsigned NodeIdx) { + ArrayRef VL = E->getOperand(NodeIdx); InstructionsState S = getSameOpcode(VL, *TLI); // Special processing for GEPs bundle, which may include non-gep values. if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) { @@ -12532,109 +12523,113 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, if (It != VL.end()) S = getSameOpcode(*It, *TLI); } - if (S.getOpcode()) { - auto CheckSameVE = [&](const TreeEntry *VE) { - return VE->isSame(VL) && - (any_of(VE->UserTreeIndices, - [E, NodeIdx](const EdgeInfo &EI) { - return EI.UserTE == E && EI.EdgeIdx == NodeIdx; - }) || - any_of(VectorizableTree, - [E, NodeIdx, VE](const std::unique_ptr &TE) { - return TE->isOperandGatherNode({E, NodeIdx}) && - VE->isSame(TE->Scalars); - })); + if (!S.getOpcode()) + return nullptr; + auto CheckSameVE = [&](const TreeEntry *VE) { + return VE->isSame(VL) && + (any_of(VE->UserTreeIndices, + [E, NodeIdx](const EdgeInfo &EI) { + return EI.UserTE == E && EI.EdgeIdx == NodeIdx; + }) || + any_of(VectorizableTree, + [E, NodeIdx, VE](const std::unique_ptr &TE) { + return TE->isOperandGatherNode( + {const_cast(E), NodeIdx}) && + VE->isSame(TE->Scalars); + })); + }; + TreeEntry *VE = getTreeEntry(S.OpValue); + if (VE && CheckSameVE(VE)) + return VE; + auto It = MultiNodeScalars.find(S.OpValue); + if (It != MultiNodeScalars.end()) { + auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) { + return TE != VE && CheckSameVE(TE); + }); + if (I != It->getSecond().end()) + return *I; + } + return nullptr; +} + +Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, + bool PostponedPHIs) { + ValueList &VL = E->getOperand(NodeIdx); + const unsigned VF = VL.size(); + if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) { + auto FinalShuffle = [&](Value *V, ArrayRef Mask) { + // V may be affected by MinBWs. + // We want ShuffleInstructionBuilder to correctly support REVEC. The key + // factor is the number of elements, not their type. 
+ Type *ScalarTy = cast(V->getType())->getElementType(); + unsigned NumElements = getNumElements(VL.front()->getType()); + ShuffleInstructionBuilder ShuffleBuilder( + NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements) + : ScalarTy, + Builder, *this); + ShuffleBuilder.add(V, Mask); + SmallVector> SubVectors( + E->CombinedEntriesWithIndices.size()); + transform(E->CombinedEntriesWithIndices, SubVectors.begin(), + [&](const auto &P) { + return std::make_pair(VectorizableTree[P.first].get(), + P.second); + }); + return ShuffleBuilder.finalize(std::nullopt, SubVectors); }; - TreeEntry *VE = getTreeEntry(S.OpValue); - bool IsSameVE = VE && CheckSameVE(VE); - if (!IsSameVE) { - auto It = MultiNodeScalars.find(S.OpValue); - if (It != MultiNodeScalars.end()) { - auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) { - return TE != VE && CheckSameVE(TE); - }); - if (I != It->getSecond().end()) { - VE = *I; - IsSameVE = true; - } - } - } - if (IsSameVE) { - auto FinalShuffle = [&](Value *V, ArrayRef Mask) { - // V may be affected by MinBWs. - // We want ShuffleInstructionBuilder to correctly support REVEC. The key - // factor is the number of elements, not their type. - Type *ScalarTy = cast(V->getType())->getElementType(); - unsigned NumElements = getNumElements(VL.front()->getType()); - ShuffleInstructionBuilder ShuffleBuilder( - NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements) - : ScalarTy, - Builder, *this); - ShuffleBuilder.add(V, Mask); - SmallVector> SubVectors( - E->CombinedEntriesWithIndices.size()); - transform(E->CombinedEntriesWithIndices, SubVectors.begin(), - [&](const auto &P) { - return std::make_pair(VectorizableTree[P.first].get(), - P.second); - }); - return ShuffleBuilder.finalize(std::nullopt, SubVectors); - }; - Value *V = vectorizeTree(VE, PostponedPHIs); - if (VF * getNumElements(VL[0]->getType()) != - cast(V->getType())->getNumElements()) { - if (!VE->ReuseShuffleIndices.empty()) { - // Reshuffle to get only unique values. - // If some of the scalars are duplicated in the vectorization - // tree entry, we do not vectorize them but instead generate a - // mask for the reuses. But if there are several users of the - // same entry, they may have different vectorization factors. - // This is especially important for PHI nodes. In this case, we - // need to adapt the resulting instruction for the user - // vectorization factor and have to reshuffle it again to take - // only unique elements of the vector. Without this code the - // function incorrectly returns reduced vector instruction with - // the same elements, not with the unique ones. - - // block: - // %phi = phi <2 x > { .., %entry} {%shuffle, %block} - // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0> - // ... (use %2) - // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0} - // br %block - SmallVector Mask(VF, PoisonMaskElem); - for (auto [I, V] : enumerate(VL)) { - if (isa(V)) - continue; - Mask[I] = VE->findLaneForValue(V); - } - V = FinalShuffle(V, Mask); - } else { - assert(VF < cast(V->getType())->getNumElements() && - "Expected vectorization factor less " - "than original vector size."); - SmallVector UniformMask(VF, 0); - std::iota(UniformMask.begin(), UniformMask.end(), 0); - V = FinalShuffle(V, UniformMask); + Value *V = vectorizeTree(VE, PostponedPHIs); + if (VF * getNumElements(VL[0]->getType()) != + cast(V->getType())->getNumElements()) { + if (!VE->ReuseShuffleIndices.empty()) { + // Reshuffle to get only unique values. 
+ // If some of the scalars are duplicated in the vectorization + // tree entry, we do not vectorize them but instead generate a + // mask for the reuses. But if there are several users of the + // same entry, they may have different vectorization factors. + // This is especially important for PHI nodes. In this case, we + // need to adapt the resulting instruction for the user + // vectorization factor and have to reshuffle it again to take + // only unique elements of the vector. Without this code the + // function incorrectly returns reduced vector instruction with + // the same elements, not with the unique ones. + + // block: + // %phi = phi <2 x > { .., %entry} {%shuffle, %block} + // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0> + // ... (use %2) + // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0} + // br %block + SmallVector Mask(VF, PoisonMaskElem); + for (auto [I, V] : enumerate(VL)) { + if (isa(V)) + continue; + Mask[I] = VE->findLaneForValue(V); } - } - // Need to update the operand gather node, if actually the operand is not a - // vectorized node, but the buildvector/gather node, which matches one of - // the vectorized nodes. - if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) { - return EI.UserTE == E && EI.EdgeIdx == NodeIdx; - }) == VE->UserTreeIndices.end()) { - auto *It = find_if( - VectorizableTree, [&](const std::unique_ptr &TE) { - return TE->isGather() && - TE->UserTreeIndices.front().UserTE == E && - TE->UserTreeIndices.front().EdgeIdx == NodeIdx; - }); - assert(It != VectorizableTree.end() && "Expected gather node operand."); - (*It)->VectorizedValue = V; - } - return V; + V = FinalShuffle(V, Mask); + } else { + assert(VF < cast(V->getType())->getNumElements() && + "Expected vectorization factor less " + "than original vector size."); + SmallVector UniformMask(VF, 0); + std::iota(UniformMask.begin(), UniformMask.end(), 0); + V = FinalShuffle(V, UniformMask); + } + } + // Need to update the operand gather node, if actually the operand is not a + // vectorized node, but the buildvector/gather node, which matches one of + // the vectorized nodes. + if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) { + return EI.UserTE == E && EI.EdgeIdx == NodeIdx; + }) == VE->UserTreeIndices.end()) { + auto *It = + find_if(VectorizableTree, [&](const std::unique_ptr &TE) { + return TE->isGather() && TE->UserTreeIndices.front().UserTE == E && + TE->UserTreeIndices.front().EdgeIdx == NodeIdx; + }); + assert(It != VectorizableTree.end() && "Expected gather node operand."); + (*It)->VectorizedValue = V; } + return V; } // Find the corresponding gather entry and vectorize it. 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-bv-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-bv-vectorized.ll new file mode 100644 index 00000000000000..c44ef376f81fab --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-bv-vectorized.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define void @test(ptr %p) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr [[GEP1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %conv548.2.i.13 = zext i32 0 to i64 + %and551.2.i.13 = and i64 0, %conv548.2.i.13 + %conv548.3.i.13 = zext i32 0 to i64 + %and551.3.i.13 = and i64 0, %conv548.3.i.13 + %0 = trunc i64 %and551.2.i.13 to i32 + %conv54.2.i.14 = and i32 %0, 0 + %conv548.2.i.14 = zext i32 %conv54.2.i.14 to i64 + %and551.2.i.14 = and i64 %and551.2.i.13, %conv548.2.i.14 + %1 = trunc i64 %and551.3.i.13 to i32 + %conv54.3.i.14 = and i32 %1, 0 + %conv548.3.i.14 = zext i32 %conv54.3.i.14 to i64 + %and551.3.i.14 = and i64 %and551.3.i.13, %conv548.3.i.14 + %and551.2.i.15 = and i64 %and551.2.i.14, 0 + %and551.3.i.15 = and i64 %and551.3.i.14, 0 + %and551.2.i.16 = and i64 %and551.2.i.15, 0 + %and551.3.i.16 = and i64 %and551.3.i.15, 0 + %and551.2.i.17 = and i64 %and551.2.i.16, 0 + %and551.3.i.17 = and i64 %and551.3.i.16, 0 + %and551.2.i.18 = and i64 %and551.2.i.17, 0 + %and551.3.i.18 = and i64 %and551.3.i.17, 0 + %and551.2.i.19 = and i64 %and551.2.i.18, 0 + %and551.3.i.19 = and i64 %and551.3.i.18, 0 + %and551.2.i.20 = and i64 %and551.2.i.19, 0 + %and551.3.i.20 = and i64 %and551.3.i.19, 0 + %and551.2.i.21 = and i64 %and551.2.i.20, 0 + %and551.3.i.21 = and i64 %and551.3.i.20, 0 + %gep1 = getelementptr inbounds i8, ptr %p, i64 16 + %gep2 = getelementptr inbounds i8, ptr %p, i64 24 + store i64 %and551.2.i.21, ptr %gep1, align 16 + store i64 %and551.3.i.21, ptr %gep2, align 8 + ret void +} From 6023d17e6b6624913b85fe9d2b5d79ae681e5764 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 30 Aug 2024 10:35:10 -0700 Subject: [PATCH 74/98] [SLP][NFC]Add a function description, NFC. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e9785ef9ded2d5..f6a797b071b65c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2864,6 +2864,8 @@ class BoUpSLP { /// avoid issues with def-use order. Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs); + /// Returns vectorized operand node, that matches the order of the scalars + /// operand number \p NodeIdx in entry \p E. TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx); const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx) const { From ef7b18a53c0d186dcda1e322be6035407fdedb55 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 30 Aug 2024 18:42:54 +0100 Subject: [PATCH 75/98] [X86] Remove trailing whitespace. NFC.
Noticed in clang-formatting of #106750 --- .../lib/Target/X86/X86InstCombineIntrinsic.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index 9cc5ed5d89ad70..a62fb7f723cdbc 100644 --- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -3009,15 +3009,15 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { case Intrinsic::x86_avx512_vpermi2var_d_128: case Intrinsic::x86_avx512_vpermi2var_d_256: case Intrinsic::x86_avx512_vpermi2var_d_512: - case Intrinsic::x86_avx512_vpermi2var_hi_128: - case Intrinsic::x86_avx512_vpermi2var_hi_256: - case Intrinsic::x86_avx512_vpermi2var_hi_512: - case Intrinsic::x86_avx512_vpermi2var_pd_128: - case Intrinsic::x86_avx512_vpermi2var_pd_256: - case Intrinsic::x86_avx512_vpermi2var_pd_512: - case Intrinsic::x86_avx512_vpermi2var_ps_128: - case Intrinsic::x86_avx512_vpermi2var_ps_256: - case Intrinsic::x86_avx512_vpermi2var_ps_512: + case Intrinsic::x86_avx512_vpermi2var_hi_128: + case Intrinsic::x86_avx512_vpermi2var_hi_256: + case Intrinsic::x86_avx512_vpermi2var_hi_512: + case Intrinsic::x86_avx512_vpermi2var_pd_128: + case Intrinsic::x86_avx512_vpermi2var_pd_256: + case Intrinsic::x86_avx512_vpermi2var_pd_512: + case Intrinsic::x86_avx512_vpermi2var_ps_128: + case Intrinsic::x86_avx512_vpermi2var_ps_256: + case Intrinsic::x86_avx512_vpermi2var_ps_512: case Intrinsic::x86_avx512_vpermi2var_q_128: case Intrinsic::x86_avx512_vpermi2var_q_256: case Intrinsic::x86_avx512_vpermi2var_q_512: From d0d0e125a66b7c7921ad82c13c893bf592f071ba Mon Sep 17 00:00:00 2001 From: Marina Taylor Date: Fri, 30 Aug 2024 18:48:08 +0100 Subject: [PATCH 76/98] [AArch64] Fix a presumed typo in isFPImmLegal limit. NFC (#106716) The worst possible case for a double literal goes like: ``` mov ... movk ..., lsl #16 movk ..., lsl #32 movk ..., lsl #48 fmov ... ``` The limit of 5 in the code gives the impression that `Insn` includes all instructions including the `fmov`, but that's not true. It only counts the integer moves. This led me astray on some other work in this area. --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 28ad0abf25703b..11aca69db0a148 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11463,7 +11463,9 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, // movw+movk is fused). So we limit up to 2 instrdduction at most. SmallVector Insn; AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn); - unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2)); + assert(Insn.size() <= 4 && + "Should be able to build any value with at most 4 moves"); + unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2)); IsLegal = Insn.size() <= Limit; } From 0efa38699a4988793cdd51426fe27f00b5e5ce37 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Sat, 31 Aug 2024 01:50:24 +0800 Subject: [PATCH 77/98] [RISCV] Check VL dominates and potentially move in tryReduceVL (#106753) Similar to what we do in foldVMV_V_V with the passthru, if we end up changing the Src's VL in tryReduceVL we need to make sure it dominates. 
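In outline, the fix hoists the existing passthru dominance check into a shared ensureDominates() helper and reuses it for the VL operand. A condensed sketch of that helper follows (simplified from the diff below; the real code also early-outs for non-register operands and carries a FIXME about updating V0Defs):

```
// Sketch: before rewriting Src's VL operand, make sure the register that
// defines the new VL dominates Src; otherwise try to sink Src to just
// after that def, or give up on the peephole.
MachineInstr *Def = MRI->getVRegDef(MO.getReg());
if (Def->getParent() == Src.getParent() && !dominates(Def, Src)) {
  if (!isSafeToMove(Src, *Def->getNextNode()))
    return false; // cannot make the narrowed VL legal here
  Src.moveBefore(Def->getNextNode());
}
return true;
```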
Fixes #106735 --- llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 35 ++++++++++++++----- .../CodeGen/RISCV/rvv/reduce-vl-peephole.ll | 19 ++++++++++ .../CodeGen/RISCV/rvv/reduce-vl-peephole.mir | 15 ++++++++ 3 files changed, 60 insertions(+), 9 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/reduce-vl-peephole.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/reduce-vl-peephole.mir diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index 0f8f9442877e33..6df3b951f5a06f 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -70,6 +70,7 @@ class RISCVVectorPeephole : public MachineFunctionPass { bool isAllOnesMask(const MachineInstr *MaskDef) const; std::optional getConstant(const MachineOperand &VL) const; + bool ensureDominates(const MachineOperand &Use, MachineInstr &Src) const; /// Maps uses of V0 to the corresponding def of V0. DenseMap V0Defs; @@ -165,6 +166,9 @@ bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const { if (VL.isIdenticalTo(SrcVL) || !isVLKnownLE(VL, SrcVL)) return false; + if (!ensureDominates(VL, *Src)) + return false; + if (VL.isImm()) SrcVL.ChangeToImmediate(VL.getImm()); else if (VL.isReg()) @@ -456,6 +460,26 @@ static bool dominates(MachineBasicBlock::const_iterator A, return &*I == A; } +/// If the register in \p MO doesn't dominate \p Src, try to move \p Src so it +/// does. Returns false if doesn't dominate and we can't move. \p MO must be in +/// the same basic block as \Src. +bool RISCVVectorPeephole::ensureDominates(const MachineOperand &MO, + MachineInstr &Src) const { + assert(MO.getParent()->getParent() == Src.getParent()); + if (!MO.isReg() || MO.getReg() == RISCV::NoRegister) + return true; + + MachineInstr *Def = MRI->getVRegDef(MO.getReg()); + if (Def->getParent() == Src.getParent() && !dominates(Def, Src)) { + if (!isSafeToMove(Src, *Def->getNextNode())) + return false; + // FIXME: Update V0Defs + Src.moveBefore(Def->getNextNode()); + } + + return true; +} + /// If a PseudoVMV_V_V is the only user of its input, fold its passthru and VL /// into it. /// @@ -501,15 +525,8 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { return false; // If the new passthru doesn't dominate Src, try to move Src so it does. 
- if (Passthru.getReg() != RISCV::NoRegister) { - MachineInstr *PassthruDef = MRI->getVRegDef(Passthru.getReg()); - if (PassthruDef->getParent() == Src->getParent() && - !dominates(PassthruDef, Src)) { - if (!isSafeToMove(*Src, *PassthruDef->getNextNode())) - return false; - Src->moveBefore(PassthruDef->getNextNode()); - } - } + if (!ensureDominates(Passthru, *Src)) + return false; if (SrcPassthru.getReg() != Passthru.getReg()) { SrcPassthru.setReg(Passthru.getReg()); diff --git a/llvm/test/CodeGen/RISCV/rvv/reduce-vl-peephole.ll b/llvm/test/CodeGen/RISCV/rvv/reduce-vl-peephole.ll new file mode 100644 index 00000000000000..7f70b0ed224ec0 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/reduce-vl-peephole.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s + +define void @avl_not_dominated( %v, ptr %p) { +; CHECK-LABEL: avl_not_dominated: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vadd.vi v8, v8, 1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret + %w = add %v, splat (i32 1) + %evl = extractelement %v, i32 0 + call void @llvm.vp.store( %w, ptr %p, splat(i1 true), i32 %evl) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/reduce-vl-peephole.mir b/llvm/test/CodeGen/RISCV/rvv/reduce-vl-peephole.mir new file mode 100644 index 00000000000000..5a223580821b75 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/reduce-vl-peephole.mir @@ -0,0 +1,15 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc %s -o - -mtriple=riscv64 -mattr=+v -run-pass=riscv-vector-peephole \ +# RUN: -verify-machineinstrs | FileCheck %s +--- +name: avl_not_dominated +body: | + bb.0: + ; CHECK-LABEL: name: avl_not_dominated + ; CHECK: %evl:gprnox0 = ADDI $x0, 1 + ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, %evl, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: PseudoVSE32_V_M1 %x, $noreg, %evl, 5 /* e32 */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 /* tu, mu */ + %evl:gprnox0 = ADDI $x0, 1 + PseudoVSE32_V_M1 %x:vr, $noreg, %evl, 5 /* e32 */ +... From 130eddf7a13f15c9c48b7fa7faf60e9bbee4f703 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 30 Aug 2024 10:58:32 -0700 Subject: [PATCH 78/98] [lldb] Deal with SupportFiles in SourceManager (NFC) (#106740) To support detecting MD5 checksum mismatches, deal with SupportFiles rather than a plain FileSpecs in the SourceManager. 
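As a caller-side sketch (hypothetical call site; the signatures follow the header changes below), the manager now traffics in lldb::SupportFileSP, which can carry a checksum alongside the FileSpec:

```
// Wrap the bare FileSpec in a SupportFile before handing it to the
// SourceManager; the SupportFile can also carry the file's checksum.
lldb::SupportFileSP file_sp = std::make_shared<SupportFile>(file_spec);
source_mgr.SetDefaultFileAndLine(file_sp, line);

// The getter now returns an optional struct instead of out-parameters.
if (auto def = source_mgr.GetDefaultFileAndLine())
  use(def->support_file_sp, def->line); // use() is a placeholder
```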
--- lldb/include/lldb/Core/SourceManager.h | 36 ++++---- lldb/source/API/SBSourceManager.cpp | 10 +-- .../BreakpointResolverFileRegex.cpp | 3 +- .../Commands/CommandObjectBreakpoint.cpp | 16 ++-- lldb/source/Commands/CommandObjectSource.cpp | 26 +++--- lldb/source/Core/Disassembler.cpp | 3 +- lldb/source/Core/IOHandlerCursesGUI.cpp | 4 +- lldb/source/Core/SourceManager.cpp | 84 +++++++++---------- lldb/source/Expression/REPL.cpp | 15 ++-- lldb/source/Target/StackFrame.cpp | 2 +- lldb/source/Target/StackFrameList.cpp | 2 +- 11 files changed, 107 insertions(+), 94 deletions(-) diff --git a/lldb/include/lldb/Core/SourceManager.h b/lldb/include/lldb/Core/SourceManager.h index 8feeb4347dd52e..ae7bd3d2311f96 100644 --- a/lldb/include/lldb/Core/SourceManager.h +++ b/lldb/include/lldb/Core/SourceManager.h @@ -141,14 +141,13 @@ class SourceManager { ~SourceManager(); - FileSP GetLastFile() { return GetFile(m_last_file_spec); } + FileSP GetLastFile() { return GetFile(m_last_support_file_sp); } - size_t - DisplaySourceLinesWithLineNumbers(const FileSpec &file, uint32_t line, - uint32_t column, uint32_t context_before, - uint32_t context_after, - const char *current_line_cstr, Stream *s, - const SymbolContextList *bp_locs = nullptr); + size_t DisplaySourceLinesWithLineNumbers( + lldb::SupportFileSP support_file_sp, uint32_t line, uint32_t column, + uint32_t context_before, uint32_t context_after, + const char *current_line_cstr, Stream *s, + const SymbolContextList *bp_locs = nullptr); // This variant uses the last file we visited. size_t DisplaySourceLinesWithLineNumbersUsingLastFile( @@ -159,22 +158,31 @@ class SourceManager { size_t DisplayMoreWithLineNumbers(Stream *s, uint32_t count, bool reverse, const SymbolContextList *bp_locs = nullptr); - bool SetDefaultFileAndLine(const FileSpec &file_spec, uint32_t line); + bool SetDefaultFileAndLine(lldb::SupportFileSP support_file_sp, + uint32_t line); + + struct SupportFileAndLine { + lldb::SupportFileSP support_file_sp; + uint32_t line; + SupportFileAndLine(lldb::SupportFileSP support_file_sp, uint32_t line) + : support_file_sp(support_file_sp), line(line) {} + }; - bool GetDefaultFileAndLine(FileSpec &file_spec, uint32_t &line); + std::optional GetDefaultFileAndLine(); bool DefaultFileAndLineSet() { - return (GetFile(m_last_file_spec).get() != nullptr); + return (GetFile(m_last_support_file_sp).get() != nullptr); } - void FindLinesMatchingRegex(FileSpec &file_spec, RegularExpression ®ex, - uint32_t start_line, uint32_t end_line, + void FindLinesMatchingRegex(lldb::SupportFileSP support_file_sp, + RegularExpression ®ex, uint32_t start_line, + uint32_t end_line, std::vector &match_lines); - FileSP GetFile(const FileSpec &file_spec); + FileSP GetFile(lldb::SupportFileSP support_file_sp); protected: - FileSpec m_last_file_spec; + lldb::SupportFileSP m_last_support_file_sp; uint32_t m_last_line; uint32_t m_last_count; bool m_default_set; diff --git a/lldb/source/API/SBSourceManager.cpp b/lldb/source/API/SBSourceManager.cpp index e46f990698d826..4b96f1222bc88f 100644 --- a/lldb/source/API/SBSourceManager.cpp +++ b/lldb/source/API/SBSourceManager.cpp @@ -46,15 +46,15 @@ class SourceManagerImpl { lldb::TargetSP target_sp(m_target_wp.lock()); if (target_sp) { return target_sp->GetSourceManager().DisplaySourceLinesWithLineNumbers( - file, line, column, context_before, context_after, current_line_cstr, - s); + std::make_shared(file), line, column, context_before, + context_after, current_line_cstr, s); } else { lldb::DebuggerSP debugger_sp(m_debugger_wp.lock()); 
if (debugger_sp) { return debugger_sp->GetSourceManager() - .DisplaySourceLinesWithLineNumbers(file, line, column, - context_before, context_after, - current_line_cstr, s); + .DisplaySourceLinesWithLineNumbers( + std::make_shared(file), line, column, + context_before, context_after, current_line_cstr, s); } } return 0; diff --git a/lldb/source/Breakpoint/BreakpointResolverFileRegex.cpp b/lldb/source/Breakpoint/BreakpointResolverFileRegex.cpp index 0509924e6300be..05fa7b93096889 100644 --- a/lldb/source/Breakpoint/BreakpointResolverFileRegex.cpp +++ b/lldb/source/Breakpoint/BreakpointResolverFileRegex.cpp @@ -102,7 +102,8 @@ Searcher::CallbackReturn BreakpointResolverFileRegex::SearchCallback( FileSpec cu_file_spec = cu->GetPrimaryFile(); std::vector line_matches; context.target_sp->GetSourceManager().FindLinesMatchingRegex( - cu_file_spec, m_regex, 1, UINT32_MAX, line_matches); + std::make_shared(cu_file_spec), m_regex, 1, UINT32_MAX, + line_matches); uint32_t num_matches = line_matches.size(); for (uint32_t i = 0; i < num_matches; i++) { diff --git a/lldb/source/Commands/CommandObjectBreakpoint.cpp b/lldb/source/Commands/CommandObjectBreakpoint.cpp index abde27b2b53ad8..ede3dd2f2a864c 100644 --- a/lldb/source/Commands/CommandObjectBreakpoint.cpp +++ b/lldb/source/Commands/CommandObjectBreakpoint.cpp @@ -769,20 +769,26 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { private: bool GetDefaultFile(Target &target, FileSpec &file, CommandReturnObject &result) { - uint32_t default_line; // First use the Source Manager's default file. Then use the current stack // frame's file. - if (!target.GetSourceManager().GetDefaultFileAndLine(file, default_line)) { + if (auto maybe_file_and_line = + target.GetSourceManager().GetDefaultFileAndLine()) { + file = maybe_file_and_line->support_file_sp->GetSpecOnly(); + return true; + } + StackFrame *cur_frame = m_exe_ctx.GetFramePtr(); if (cur_frame == nullptr) { result.AppendError( "No selected frame to use to find the default file."); return false; - } else if (!cur_frame->HasDebugInformation()) { + } + if (!cur_frame->HasDebugInformation()) { result.AppendError("Cannot use the selected frame to find the default " "file, it has no debug info."); return false; - } else { + } + const SymbolContext &sc = cur_frame->GetSymbolContext(eSymbolContextLineEntry); if (sc.line_entry.GetFile()) { @@ -791,8 +797,6 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { result.AppendError("Can't find the file for the selected frame to " "use as the default file."); return false; - } - } } return true; } diff --git a/lldb/source/Commands/CommandObjectSource.cpp b/lldb/source/Commands/CommandObjectSource.cpp index 1a0629c6765d41..1fc122420388d8 100644 --- a/lldb/source/Commands/CommandObjectSource.cpp +++ b/lldb/source/Commands/CommandObjectSource.cpp @@ -777,14 +777,16 @@ class CommandObjectSourceList : public CommandObjectParsed { if (sc.function) { Target &target = GetTarget(); - FileSpec start_file; + SupportFileSP start_file = std::make_shared(); uint32_t start_line; uint32_t end_line; FileSpec end_file; if (sc.block == nullptr) { // Not an inlined function - sc.function->GetStartLineSourceInfo(start_file, start_line); + FileSpec function_file_spec; + sc.function->GetStartLineSourceInfo(function_file_spec, start_line); + start_file = std::make_shared(function_file_spec); if (start_line == 0) { result.AppendErrorWithFormat("Could not find line information for " "start of function: \"%s\".\n", @@ -794,7 +796,7 @@ class CommandObjectSourceList 
: public CommandObjectParsed { sc.function->GetEndLineSourceInfo(end_file, end_line); } else { // We have an inlined function - start_file = source_info.line_entry.GetFile(); + start_file = source_info.line_entry.file_sp; start_line = source_info.line_entry.line; end_line = start_line + m_options.num_lines; } @@ -825,14 +827,15 @@ class CommandObjectSourceList : public CommandObjectParsed { if (m_options.show_bp_locs) { const bool show_inlines = true; - m_breakpoint_locations.Reset(start_file, 0, show_inlines); + m_breakpoint_locations.Reset(start_file->GetSpecOnly(), 0, + show_inlines); SearchFilterForUnconstrainedSearches target_search_filter( m_exe_ctx.GetTargetSP()); target_search_filter.Search(m_breakpoint_locations); } - result.AppendMessageWithFormat("File: %s\n", - start_file.GetPath().c_str()); + result.AppendMessageWithFormat( + "File: %s\n", start_file->GetSpecOnly().GetPath().c_str()); // We don't care about the column here. const uint32_t column = 0; return target.GetSourceManager().DisplaySourceLinesWithLineNumbers( @@ -1050,8 +1053,9 @@ class CommandObjectSourceList : public CommandObjectParsed { ? sc.line_entry.column : 0; target.GetSourceManager().DisplaySourceLinesWithLineNumbers( - sc.comp_unit->GetPrimaryFile(), sc.line_entry.line, column, - lines_to_back_up, m_options.num_lines - lines_to_back_up, "->", + std::make_shared(sc.comp_unit->GetPrimaryFile()), + sc.line_entry.line, column, lines_to_back_up, + m_options.num_lines - lines_to_back_up, "->", &result.GetOutputStream(), GetBreakpointLocations()); result.SetStatus(eReturnStatusSuccessFinishResult); } @@ -1170,9 +1174,9 @@ class CommandObjectSourceList : public CommandObjectParsed { m_options.num_lines = 10; const uint32_t column = 0; target.GetSourceManager().DisplaySourceLinesWithLineNumbers( - sc.comp_unit->GetPrimaryFile(), m_options.start_line, column, 0, - m_options.num_lines, "", &result.GetOutputStream(), - GetBreakpointLocations()); + std::make_shared(sc.comp_unit->GetPrimaryFile()), + m_options.start_line, column, 0, m_options.num_lines, "", + &result.GetOutputStream(), GetBreakpointLocations()); result.SetStatus(eReturnStatusSuccessFinishResult); } else { diff --git a/lldb/source/Core/Disassembler.cpp b/lldb/source/Core/Disassembler.cpp index 9286f62058bc8d..d071e3bfe4f77d 100644 --- a/lldb/source/Core/Disassembler.cpp +++ b/lldb/source/Core/Disassembler.cpp @@ -517,7 +517,8 @@ void Disassembler::PrintInstructions(Debugger &debugger, const ArchSpec &arch, line_highlight = "**"; } source_manager.DisplaySourceLinesWithLineNumbers( - ln.file, ln.line, ln.column, 0, 0, line_highlight, &strm); + std::make_shared(ln.file), ln.line, ln.column, 0, 0, + line_highlight, &strm); } if (source_lines_to_display.print_source_context_end_eol) strm.EOL(); diff --git a/lldb/source/Core/IOHandlerCursesGUI.cpp b/lldb/source/Core/IOHandlerCursesGUI.cpp index 8f44e3d0cd016b..3d69aedb6b13ee 100644 --- a/lldb/source/Core/IOHandlerCursesGUI.cpp +++ b/lldb/source/Core/IOHandlerCursesGUI.cpp @@ -6910,8 +6910,8 @@ class SourceFileWindowDelegate : public WindowDelegate { } else { // File changed, set selected line to the line with the PC m_selected_line = m_pc_line; - m_file_sp = m_debugger.GetSourceManager().GetFile( - m_sc.line_entry.GetFile()); + m_file_sp = + m_debugger.GetSourceManager().GetFile(m_sc.line_entry.file_sp); if (m_file_sp) { const size_t num_lines = m_file_sp->GetNumLines(); m_line_width = 1; diff --git a/lldb/source/Core/SourceManager.cpp b/lldb/source/Core/SourceManager.cpp index cd0011a25f1c39..c427bb91f4643a 
100644 --- a/lldb/source/Core/SourceManager.cpp +++ b/lldb/source/Core/SourceManager.cpp @@ -63,18 +63,22 @@ static void resolve_tilde(FileSpec &file_spec) { // SourceManager constructor SourceManager::SourceManager(const TargetSP &target_sp) - : m_last_line(0), m_last_count(0), m_default_set(false), - m_target_wp(target_sp), + : m_last_support_file_sp(std::make_shared()), m_last_line(0), + m_last_count(0), m_default_set(false), m_target_wp(target_sp), m_debugger_wp(target_sp->GetDebugger().shared_from_this()) {} SourceManager::SourceManager(const DebuggerSP &debugger_sp) - : m_last_line(0), m_last_count(0), m_default_set(false), m_target_wp(), + : m_last_support_file_sp(std::make_shared()), m_last_line(0), + m_last_count(0), m_default_set(false), m_target_wp(), m_debugger_wp(debugger_sp) {} // Destructor SourceManager::~SourceManager() = default; -SourceManager::FileSP SourceManager::GetFile(const FileSpec &file_spec) { +SourceManager::FileSP SourceManager::GetFile(SupportFileSP support_file_sp) { + assert(support_file_sp && "SupportFileSP must be valid"); + + FileSpec file_spec = support_file_sp->GetSpecOnly(); if (!file_spec) return {}; @@ -87,10 +91,8 @@ SourceManager::FileSP SourceManager::GetFile(const FileSpec &file_spec) { LLDB_LOG(log, "Source file caching disabled: creating new source file: {0}", file_spec); if (target_sp) - return std::make_shared(std::make_shared(file_spec), - target_sp); - return std::make_shared(std::make_shared(file_spec), - debugger_sp); + return std::make_shared(support_file_sp, target_sp); + return std::make_shared(support_file_sp, debugger_sp); } ProcessSP process_sp = target_sp ? target_sp->GetProcessSP() : ProcessSP(); @@ -151,11 +153,9 @@ SourceManager::FileSP SourceManager::GetFile(const FileSpec &file_spec) { // (Re)create the file. if (target_sp) - file_sp = std::make_shared(std::make_shared(file_spec), - target_sp); + file_sp = std::make_shared(support_file_sp, target_sp); else - file_sp = std::make_shared(std::make_shared(file_spec), - debugger_sp); + file_sp = std::make_shared(support_file_sp, debugger_sp); // Add the file to the debugger and process cache. If the file was // invalidated, this will overwrite it. 
@@ -235,11 +235,8 @@ size_t SourceManager::DisplaySourceLinesWithLineNumbersUsingLastFile( start_line = 1; } - if (!m_default_set) { - FileSpec tmp_spec; - uint32_t tmp_line; - GetDefaultFileAndLine(tmp_spec, tmp_line); - } + if (!m_default_set) + GetDefaultFileAndLine(); m_last_line = start_line; m_last_count = count; @@ -310,11 +307,12 @@ size_t SourceManager::DisplaySourceLinesWithLineNumbersUsingLastFile( } size_t SourceManager::DisplaySourceLinesWithLineNumbers( - const FileSpec &file_spec, uint32_t line, uint32_t column, + lldb::SupportFileSP support_file_sp, uint32_t line, uint32_t column, uint32_t context_before, uint32_t context_after, const char *current_line_cstr, Stream *s, const SymbolContextList *bp_locs) { - FileSP file_sp(GetFile(file_spec)); + assert(support_file_sp && "SupportFile must be valid"); + FileSP file_sp(GetFile(support_file_sp)); uint32_t start_line; uint32_t count = context_before + context_after + 1; @@ -327,8 +325,9 @@ size_t SourceManager::DisplaySourceLinesWithLineNumbers( if (last_file_sp.get() != file_sp.get()) { if (line == 0) m_last_line = 0; - m_last_file_spec = file_spec; + m_last_support_file_sp = support_file_sp; } + return DisplaySourceLinesWithLineNumbersUsingLastFile( start_line, count, line, column, current_line_cstr, s, bp_locs); } @@ -339,11 +338,8 @@ size_t SourceManager::DisplayMoreWithLineNumbers( // to figure it out here. FileSP last_file_sp(GetLastFile()); const bool have_default_file_line = last_file_sp && m_last_line > 0; - if (!m_default_set) { - FileSpec tmp_spec; - uint32_t tmp_line; - GetDefaultFileAndLine(tmp_spec, tmp_line); - } + if (!m_default_set) + GetDefaultFileAndLine(); if (last_file_sp) { if (m_last_line == UINT32_MAX) @@ -378,26 +374,27 @@ size_t SourceManager::DisplayMoreWithLineNumbers( return 0; } -bool SourceManager::SetDefaultFileAndLine(const FileSpec &file_spec, +bool SourceManager::SetDefaultFileAndLine(lldb::SupportFileSP support_file_sp, uint32_t line) { + assert(support_file_sp && "SupportFile must be valid"); + m_default_set = true; - FileSP file_sp(GetFile(file_spec)); - if (file_sp) { + if (FileSP file_sp = GetFile(support_file_sp)) { m_last_line = line; - m_last_file_spec = file_spec; + m_last_support_file_sp = support_file_sp; return true; - } else { - return false; } + + return false; } -bool SourceManager::GetDefaultFileAndLine(FileSpec &file_spec, uint32_t &line) { - if (FileSP last_file_sp = GetLastFile()) { - file_spec = m_last_file_spec; - line = m_last_line; - return true; - } else if (!m_default_set) { +std::optional +SourceManager::GetDefaultFileAndLine() { + if (FileSP last_file_sp = GetLastFile()) + return SupportFileAndLine(m_last_support_file_sp, m_last_line); + + if (!m_default_set) { TargetSP target_sp(m_target_wp.lock()); if (target_sp) { @@ -423,26 +420,25 @@ bool SourceManager::GetDefaultFileAndLine(FileSpec &file_spec, uint32_t &line) { if (sc.function->GetAddressRange() .GetBaseAddress() .CalculateSymbolContextLineEntry(line_entry)) { - SetDefaultFileAndLine(line_entry.GetFile(), line_entry.line); - file_spec = m_last_file_spec; - line = m_last_line; - return true; + SetDefaultFileAndLine(line_entry.file_sp, line_entry.line); + return SupportFileAndLine(line_entry.file_sp, m_last_line); } } } } } } - return false; + + return std::nullopt; } -void SourceManager::FindLinesMatchingRegex(FileSpec &file_spec, +void SourceManager::FindLinesMatchingRegex(SupportFileSP support_file_sp, RegularExpression ®ex, uint32_t start_line, uint32_t end_line, std::vector &match_lines) { 
match_lines.clear(); - FileSP file_sp = GetFile(file_spec); + FileSP file_sp = GetFile(support_file_sp); if (!file_sp) return; return file_sp->FindLinesMatchingRegex(regex, start_line, end_line, diff --git a/lldb/source/Expression/REPL.cpp b/lldb/source/Expression/REPL.cpp index a6a4ffb5e0af9e..56c50e346b39b8 100644 --- a/lldb/source/Expression/REPL.cpp +++ b/lldb/source/Expression/REPL.cpp @@ -473,7 +473,8 @@ void REPL::IOHandlerInputComplete(IOHandler &io_handler, std::string &code) { // Now set the default file and line to the REPL source file m_target.GetSourceManager().SetDefaultFileAndLine( - FileSpec(m_repl_source_path), new_default_line); + std::make_shared(FileSpec(m_repl_source_path)), + new_default_line); } static_cast(io_handler) .SetBaseLineNumber(m_code.GetSize() + 1); @@ -570,13 +571,11 @@ Status REPL::RunLoop() { lldb::IOHandlerSP io_handler_sp(GetIOHandler()); - FileSpec save_default_file; - uint32_t save_default_line = 0; + std::optional default_file_line; if (!m_repl_source_path.empty()) { // Save the current default file and line - m_target.GetSourceManager().GetDefaultFileAndLine(save_default_file, - save_default_line); + default_file_line = m_target.GetSourceManager().GetDefaultFileAndLine(); } debugger.RunIOHandlerAsync(io_handler_sp); @@ -615,8 +614,8 @@ Status REPL::RunLoop() { } // Restore the default file and line - if (save_default_file && save_default_line != 0) - m_target.GetSourceManager().SetDefaultFileAndLine(save_default_file, - save_default_line); + if (default_file_line) + m_target.GetSourceManager().SetDefaultFileAndLine( + default_file_line->support_file_sp, default_file_line->line); return error; } diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp index 5d90ed90b3d3fd..e35a4c318d358f 100644 --- a/lldb/source/Target/StackFrame.cpp +++ b/lldb/source/Target/StackFrame.cpp @@ -1923,7 +1923,7 @@ bool StackFrame::GetStatus(Stream &strm, bool show_frame_info, bool show_source, size_t num_lines = target->GetSourceManager().DisplaySourceLinesWithLineNumbers( - m_sc.line_entry.GetFile(), start_line, m_sc.line_entry.column, + m_sc.line_entry.file_sp, start_line, m_sc.line_entry.column, source_lines_before, source_lines_after, "->", &strm); if (num_lines != 0) have_source = true; diff --git a/lldb/source/Target/StackFrameList.cpp b/lldb/source/Target/StackFrameList.cpp index 7808bd3674ab19..3849ec5ed178d9 100644 --- a/lldb/source/Target/StackFrameList.cpp +++ b/lldb/source/Target/StackFrameList.cpp @@ -886,7 +886,7 @@ void StackFrameList::SetDefaultFileAndLineToSelectedFrame() { SymbolContext sc = frame_sp->GetSymbolContext(eSymbolContextLineEntry); if (sc.line_entry.GetFile()) m_thread.CalculateTarget()->GetSourceManager().SetDefaultFileAndLine( - sc.line_entry.GetFile(), sc.line_entry.line); + sc.line_entry.file_sp, sc.line_entry.line); } } } From 2c7e1b8893061fdf487f2d9945d2d1eecd59a604 Mon Sep 17 00:00:00 2001 From: vporpo Date: Fri, 30 Aug 2024 11:09:40 -0700 Subject: [PATCH 79/98] [SandboxIR] Implement ConstantFP (#106648) This patch implements sandboxir::ConstantFP mirroring llvm::ConstantFP. 
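A short usage sketch (values are arbitrary; the API mirrors llvm::ConstantFP, as the unit tests below exercise):

```
// Each factory forwards to the matching llvm::ConstantFP call and wraps
// the result through Context::getOrCreateConstant(), so equal constants
// map to the same sandboxir object.
auto *FloatTy = sandboxir::Type::getFloatTy(Ctx);
auto *FortyTwo = sandboxir::ConstantFP::get(FloatTy, 42.0);
auto *NegInf = sandboxir::ConstantFP::getInfinity(FloatTy, /*Negative=*/true);
assert(FortyTwo == sandboxir::ConstantFP::get(FloatTy, 42.0) &&
       "identical constants are uniqued");
```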
--- llvm/include/llvm/SandboxIR/SandboxIR.h | 94 ++++++++++- .../llvm/SandboxIR/SandboxIRValues.def | 1 + llvm/include/llvm/SandboxIR/Type.h | 3 +- llvm/lib/SandboxIR/SandboxIR.cpp | 52 ++++++ llvm/unittests/SandboxIR/SandboxIRTest.cpp | 155 ++++++++++++++++++ 5 files changed, 303 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 0f7752eda6d66f..2ed7243fa612f4 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -113,6 +113,7 @@ namespace sandboxir { class BasicBlock; class ConstantInt; +class ConstantFP; class Context; class Function; class Instruction; @@ -597,6 +598,94 @@ class ConstantInt : public Constant { #endif }; +// TODO: This should inherit from ConstantData. +class ConstantFP final : public Constant { + ConstantFP(llvm::ConstantFP *C, Context &Ctx) + : Constant(ClassID::ConstantFP, C, Ctx) {} + friend class Context; // For constructor. + +public: + /// This returns a ConstantFP, or a vector containing a splat of a ConstantFP, + /// for the specified value in the specified type. This should only be used + /// for simple constant values like 2.0/1.0 etc, that are known-valid both as + /// host double and as the target format. + static Constant *get(Type *Ty, double V); + + /// If Ty is a vector type, return a Constant with a splat of the given + /// value. Otherwise return a ConstantFP for the given value. + static Constant *get(Type *Ty, const APFloat &V); + + static Constant *get(Type *Ty, StringRef Str); + + static ConstantFP *get(const APFloat &V, Context &Ctx); + + static Constant *getNaN(Type *Ty, bool Negative = false, + uint64_t Payload = 0); + static Constant *getQNaN(Type *Ty, bool Negative = false, + APInt *Payload = nullptr); + static Constant *getSNaN(Type *Ty, bool Negative = false, + APInt *Payload = nullptr); + static Constant *getZero(Type *Ty, bool Negative = false); + + static Constant *getNegativeZero(Type *Ty); + static Constant *getInfinity(Type *Ty, bool Negative = false); + + /// Return true if Ty is big enough to represent V. + static bool isValueValidForType(Type *Ty, const APFloat &V); + + inline const APFloat &getValueAPF() const { + return cast(Val)->getValueAPF(); + } + inline const APFloat &getValue() const { + return cast(Val)->getValue(); + } + + /// Return true if the value is positive or negative zero. + bool isZero() const { return cast(Val)->isZero(); } + + /// Return true if the sign bit is set. + bool isNegative() const { return cast(Val)->isNegative(); } + + /// Return true if the value is infinity + bool isInfinity() const { return cast(Val)->isInfinity(); } + + /// Return true if the value is a NaN. + bool isNaN() const { return cast(Val)->isNaN(); } + + /// We don't rely on operator== working on double values, as it returns true + /// for things that are clearly not equal, like -0.0 and 0.0. + /// As such, this method can be used to do an exact bit-for-bit comparison of + /// two floating point values. The version with a double operand is retained + /// because it's so convenient to write isExactlyValue(2.0), but please use + /// it only for simple constants. + bool isExactlyValue(const APFloat &V) const { + return cast(Val)->isExactlyValue(V); + } + + bool isExactlyValue(double V) const { + return cast(Val)->isExactlyValue(V); + } + + /// For isa/dyn_cast. 
+ static bool classof(const sandboxir::Value *From) { + return From->getSubclassID() == ClassID::ConstantFP; + } + + // TODO: Better name: getOperandNo(const Use&). Should be private. + unsigned getUseOperandNo(const Use &Use) const final { + llvm_unreachable("ConstantFP has no operands!"); + } +#ifndef NDEBUG + void verify() const override { + assert(isa(Val) && "Expected a ConstantFP!"); + } + void dumpOS(raw_ostream &OS) const override { + dumpCommonPrefix(OS); + dumpCommonSuffix(OS); + } +#endif +}; + /// Iterator for `Instruction`s in a `BasicBlock. /// \Returns an sandboxir::Instruction & when derereferenced. class BBIterator { @@ -3156,7 +3245,10 @@ class Context { Constant *getOrCreateConstant(llvm::Constant *LLVMC) { return cast(getOrCreateValueInternal(LLVMC, 0)); } - friend class ConstantInt; // For getOrCreateConstant(). + // Friends for getOrCreateConstant(). +#define DEF_CONST(ID, CLASS) friend class CLASS; +#include "llvm/SandboxIR/SandboxIRValues.def" + /// Create a sandboxir::BasicBlock for an existing LLVM IR \p BB. This will /// also create all contents of the block. BasicBlock *createBasicBlock(llvm::BasicBlock *BB); diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index d29fc3b5e95871..2fc24ed71c4cf6 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -26,6 +26,7 @@ DEF_USER(User, User) DEF_VALUE(Block, BasicBlock) DEF_CONST(Constant, Constant) DEF_CONST(ConstantInt, ConstantInt) +DEF_CONST(ConstantFP, ConstantFP) #ifndef DEF_INSTR #define DEF_INSTR(ID, OPCODE, CLASS) diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h index 4588cd2f738876..89e787f5f5d4b2 100644 --- a/llvm/include/llvm/SandboxIR/Type.h +++ b/llvm/include/llvm/SandboxIR/Type.h @@ -27,6 +27,7 @@ class PointerType; class VectorType; class FunctionType; #define DEF_INSTR(ID, OPCODE, CLASS) class CLASS; +#define DEF_CONST(ID, CLASS) class CLASS; #include "llvm/SandboxIR/SandboxIRValues.def" /// Just like llvm::Type these are immutable, unique, never get freed and can @@ -42,7 +43,7 @@ class Type { friend class ConstantInt; // For LLVMTy. // Friend all instruction classes because `create()` functions use LLVMTy. 
#define DEF_INSTR(ID, OPCODE, CLASS) friend class CLASS; - // TODO: Friend DEF_CONST() +#define DEF_CONST(ID, CLASS) friend class CLASS; #include "llvm/SandboxIR/SandboxIRValues.def" Context &Ctx; diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index bf224b73f3bad2..6bdc580f751d18 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -2248,6 +2248,54 @@ ConstantInt *ConstantInt::get(Type *Ty, uint64_t V, bool IsSigned) { return cast(Ty->getContext().getOrCreateConstant(LLVMC)); } +Constant *ConstantFP::get(Type *Ty, double V) { + auto *LLVMC = llvm::ConstantFP::get(Ty->LLVMTy, V); + return Ty->getContext().getOrCreateConstant(LLVMC); +} + +Constant *ConstantFP::get(Type *Ty, const APFloat &V) { + auto *LLVMC = llvm::ConstantFP::get(Ty->LLVMTy, V); + return Ty->getContext().getOrCreateConstant(LLVMC); +} + +Constant *ConstantFP::get(Type *Ty, StringRef Str) { + auto *LLVMC = llvm::ConstantFP::get(Ty->LLVMTy, Str); + return Ty->getContext().getOrCreateConstant(LLVMC); +} + +ConstantFP *ConstantFP::get(const APFloat &V, Context &Ctx) { + auto *LLVMC = llvm::ConstantFP::get(Ctx.LLVMCtx, V); + return cast(Ctx.getOrCreateConstant(LLVMC)); +} + +Constant *ConstantFP::getNaN(Type *Ty, bool Negative, uint64_t Payload) { + auto *LLVMC = llvm::ConstantFP::getNaN(Ty->LLVMTy, Negative, Payload); + return cast(Ty->getContext().getOrCreateConstant(LLVMC)); +} +Constant *ConstantFP::getQNaN(Type *Ty, bool Negative, APInt *Payload) { + auto *LLVMC = llvm::ConstantFP::getQNaN(Ty->LLVMTy, Negative, Payload); + return cast(Ty->getContext().getOrCreateConstant(LLVMC)); +} +Constant *ConstantFP::getSNaN(Type *Ty, bool Negative, APInt *Payload) { + auto *LLVMC = llvm::ConstantFP::getSNaN(Ty->LLVMTy, Negative, Payload); + return cast(Ty->getContext().getOrCreateConstant(LLVMC)); +} +Constant *ConstantFP::getZero(Type *Ty, bool Negative) { + auto *LLVMC = llvm::ConstantFP::getZero(Ty->LLVMTy, Negative); + return cast(Ty->getContext().getOrCreateConstant(LLVMC)); +} +Constant *ConstantFP::getNegativeZero(Type *Ty) { + auto *LLVMC = llvm::ConstantFP::getNegativeZero(Ty->LLVMTy); + return cast(Ty->getContext().getOrCreateConstant(LLVMC)); +} +Constant *ConstantFP::getInfinity(Type *Ty, bool Negative) { + auto *LLVMC = llvm::ConstantFP::getInfinity(Ty->LLVMTy, Negative); + return cast(Ty->getContext().getOrCreateConstant(LLVMC)); +} +bool ConstantFP::isValueValidForType(Type *Ty, const APFloat &V) { + return llvm::ConstantFP::isValueValidForType(Ty->LLVMTy, V); +} + FunctionType *Function::getFunctionType() const { return cast( Ctx.getType(cast(Val)->getFunctionType())); @@ -2339,6 +2387,10 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { It->second = std::unique_ptr(new ConstantInt(CI, *this)); return It->second.get(); } + if (auto *CF = dyn_cast(C)) { + It->second = std::unique_ptr(new ConstantFP(CF, *this)); + return It->second.get(); + } if (auto *F = dyn_cast(LLVMV)) It->second = std::unique_ptr(new Function(F, *this)); else diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index c543846eb2686e..01fe21eb5cfa43 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -130,6 +130,161 @@ define void @foo(i32 %v0) { EXPECT_NE(FortyThree, FortyTwo); } +TEST_F(SandboxIRTest, ConstantFP) { + parseIR(C, R"IR( +define void @foo(float %v0, double %v1) { + %fadd0 = fadd float %v0, 42.0 + %fadd1 = fadd double %v1, 43.0 + ret 
void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + auto &F = *Ctx.createFunction(&LLVMF); + auto &BB = *F.begin(); + auto It = BB.begin(); + auto *FAdd0 = cast(&*It++); + auto *FAdd1 = cast(&*It++); + auto *FortyTwo = cast(FAdd0->getOperand(1)); + [[maybe_unused]] auto *FortyThree = + cast(FAdd1->getOperand(1)); + + auto *FloatTy = sandboxir::Type::getFloatTy(Ctx); + auto *DoubleTy = sandboxir::Type::getDoubleTy(Ctx); + auto *LLVMFloatTy = Type::getFloatTy(C); + auto *LLVMDoubleTy = Type::getDoubleTy(C); + // Check that creating an identical constant gives us the same object. + auto *NewFortyTwo = sandboxir::ConstantFP::get(FloatTy, 42.0); + EXPECT_EQ(NewFortyTwo, FortyTwo); + // Check get(Type, double). + auto *FortyFour = + cast(sandboxir::ConstantFP::get(FloatTy, 44.0)); + auto *LLVMFortyFour = + cast(llvm::ConstantFP::get(LLVMFloatTy, 44.0)); + EXPECT_NE(FortyFour, FortyTwo); + EXPECT_EQ(FortyFour, Ctx.getValue(LLVMFortyFour)); + // Check get(Type, APFloat). + auto *FortyFive = cast( + sandboxir::ConstantFP::get(DoubleTy, APFloat(45.0))); + auto *LLVMFortyFive = cast( + llvm::ConstantFP::get(LLVMDoubleTy, APFloat(45.0))); + EXPECT_EQ(FortyFive, Ctx.getValue(LLVMFortyFive)); + // Check get(Type, StringRef). + auto *FortySix = sandboxir::ConstantFP::get(FloatTy, "46.0"); + EXPECT_EQ(FortySix, Ctx.getValue(llvm::ConstantFP::get(LLVMFloatTy, "46.0"))); + // Check get(APFloat). + auto *FortySeven = sandboxir::ConstantFP::get(APFloat(47.0), Ctx); + EXPECT_EQ(FortySeven, Ctx.getValue(llvm::ConstantFP::get(C, APFloat(47.0)))); + // Check getNaN(). + { + auto *NaN = sandboxir::ConstantFP::getNaN(FloatTy); + EXPECT_EQ(NaN, Ctx.getValue(llvm::ConstantFP::getNaN(LLVMFloatTy))); + } + { + auto *NaN = sandboxir::ConstantFP::getNaN(FloatTy, /*Negative=*/true); + EXPECT_EQ(NaN, Ctx.getValue(llvm::ConstantFP::getNaN(LLVMFloatTy, + /*Negative=*/true))); + } + { + auto *NaN = sandboxir::ConstantFP::getNaN(FloatTy, /*Negative=*/true, + /*Payload=*/1); + EXPECT_EQ(NaN, Ctx.getValue(llvm::ConstantFP::getNaN( + LLVMFloatTy, /*Negative=*/true, /*Payload=*/1))); + } + // Check getQNaN(). + { + auto *QNaN = sandboxir::ConstantFP::getQNaN(FloatTy); + EXPECT_EQ(QNaN, Ctx.getValue(llvm::ConstantFP::getQNaN(LLVMFloatTy))); + } + { + auto *QNaN = sandboxir::ConstantFP::getQNaN(FloatTy, /*Negative=*/true); + EXPECT_EQ(QNaN, Ctx.getValue(llvm::ConstantFP::getQNaN(LLVMFloatTy, + /*Negative=*/true))); + } + { + APInt Payload(1, 1); + auto *QNaN = + sandboxir::ConstantFP::getQNaN(FloatTy, /*Negative=*/true, &Payload); + EXPECT_EQ(QNaN, Ctx.getValue(llvm::ConstantFP::getQNaN( + LLVMFloatTy, /*Negative=*/true, &Payload))); + } + // Check getSNaN(). + { + auto *SNaN = sandboxir::ConstantFP::getSNaN(FloatTy); + EXPECT_EQ(SNaN, Ctx.getValue(llvm::ConstantFP::getSNaN(LLVMFloatTy))); + } + { + auto *SNaN = sandboxir::ConstantFP::getSNaN(FloatTy, /*Negative=*/true); + EXPECT_EQ(SNaN, Ctx.getValue(llvm::ConstantFP::getSNaN(LLVMFloatTy, + /*Negative=*/true))); + } + { + APInt Payload(1, 1); + auto *SNaN = + sandboxir::ConstantFP::getSNaN(FloatTy, /*Negative=*/true, &Payload); + EXPECT_EQ(SNaN, Ctx.getValue(llvm::ConstantFP::getSNaN( + LLVMFloatTy, /*Negative=*/true, &Payload))); + } + + // Check getZero(). 
+ { + auto *Zero = sandboxir::ConstantFP::getZero(FloatTy); + EXPECT_EQ(Zero, Ctx.getValue(llvm::ConstantFP::getZero(LLVMFloatTy))); + } + { + auto *Zero = sandboxir::ConstantFP::getZero(FloatTy, /*Negative=*/true); + EXPECT_EQ(Zero, Ctx.getValue(llvm::ConstantFP::getZero(LLVMFloatTy, + /*Negative=*/true))); + } + + // Check getNegativeZero(). + auto *NegZero = cast( + sandboxir::ConstantFP::getNegativeZero(FloatTy)); + EXPECT_EQ(NegZero, + Ctx.getValue(llvm::ConstantFP::getNegativeZero(LLVMFloatTy))); + + // Check getInfinity(). + { + auto *Inf = sandboxir::ConstantFP::getInfinity(FloatTy); + EXPECT_EQ(Inf, Ctx.getValue(llvm::ConstantFP::getInfinity(LLVMFloatTy))); + } + { + auto *Inf = sandboxir::ConstantFP::getInfinity(FloatTy, /*Negative=*/true); + EXPECT_EQ(Inf, Ctx.getValue(llvm::ConstantFP::getInfinity( + LLVMFloatTy, /*Negative=*/true))); + } + + // Check isValueValidForType(). + APFloat V(1.1); + EXPECT_EQ(sandboxir::ConstantFP::isValueValidForType(FloatTy, V), + llvm::ConstantFP::isValueValidForType(LLVMFloatTy, V)); + // Check getValueAPF(). + EXPECT_EQ(FortyFour->getValueAPF(), LLVMFortyFour->getValueAPF()); + // Check getValue(). + EXPECT_EQ(FortyFour->getValue(), LLVMFortyFour->getValue()); + // Check isZero(). + EXPECT_EQ(FortyFour->isZero(), LLVMFortyFour->isZero()); + EXPECT_TRUE(sandboxir::ConstantFP::getZero(FloatTy)); + EXPECT_TRUE(sandboxir::ConstantFP::getZero(FloatTy, /*Negative=*/true)); + // Check isNegative(). + EXPECT_TRUE(cast( + sandboxir::ConstantFP::getZero(FloatTy, /*Negative=*/true)) + ->isNegative()); + // Check isInfinity(). + EXPECT_TRUE( + cast(sandboxir::ConstantFP::getInfinity(FloatTy)) + ->isInfinity()); + // Check isNaN(). + EXPECT_TRUE( + cast(sandboxir::ConstantFP::getNaN(FloatTy)) + ->isNaN()); + // Check isExactlyValue(APFloat). + EXPECT_TRUE(NegZero->isExactlyValue(NegZero->getValueAPF())); + // Check isExactlyValue(double). + EXPECT_TRUE(NegZero->isExactlyValue(-0.0)); +} + TEST_F(SandboxIRTest, Use) { parseIR(C, R"IR( define i32 @foo(i32 %v0, i32 %v1) { From 07178981246c56e8beafe7fe49f0f442436f08c4 Mon Sep 17 00:00:00 2001 From: rjmansfield Date: Fri, 30 Aug 2024 14:15:05 -0400 Subject: [PATCH 80/98] Fix cl::desc typos in aarch64-enable-dead-defs and arm-implicit-it. 
(#106712) --- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp | 2 +- llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index bd5684a287381a..9f96f6c5e83ec4 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -98,7 +98,7 @@ static cl::opt EnableCollectLOH( static cl::opt EnableDeadRegisterElimination("aarch64-enable-dead-defs", cl::Hidden, cl::desc("Enable the pass that removes dead" - " definitons and replaces stores to" + " definitions and replaces stores to" " them with stores to the zero" " register"), cl::init(true)); diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index b7dfcc15824dc7..10fef901f77181 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -79,7 +79,7 @@ enum class ImplicitItModeTy { Always, Never, ARMOnly, ThumbOnly }; static cl::opt ImplicitItMode( "arm-implicit-it", cl::init(ImplicitItModeTy::ARMOnly), - cl::desc("Allow conditional instructions outdside of an IT block"), + cl::desc("Allow conditional instructions outside of an IT block"), cl::values(clEnumValN(ImplicitItModeTy::Always, "always", "Accept in both ISAs, emit implicit ITs in Thumb"), clEnumValN(ImplicitItModeTy::Never, "never", From c49770c60f26e449379447109f7d915bd8de0384 Mon Sep 17 00:00:00 2001 From: Nicolas van Kempen Date: Fri, 30 Aug 2024 14:26:49 -0400 Subject: [PATCH 81/98] [NFC] Prefer subprocess.DEVNULL over os.devnull (#106500) There is no need to support Python 2.7 anymore, Python 3.3+ has `subprocess.DEVNULL`. This is good practice and also prevents file handles from staying open unnecessarily. Also remove a couple unused or unneeded `__future__` imports. --- .../clang-tidy/tool/run-clang-tidy.py | 10 ++++------ clang/docs/tools/generate_formatted_state.py | 6 ++---- clang/tools/scan-view/share/startfile.py | 2 +- clang/utils/creduce-clang-crash.py | 4 +--- lldb/bindings/interface/SBErrorDocstrings.i | 6 ++++-- .../packages/Python/lldbsuite/test/decorators.py | 6 ++---- lldb/packages/Python/lldbsuite/test/lldbtest.py | 3 +-- llvm/utils/UpdateTestChecks/common.py | 16 +++++++--------- llvm/utils/git/pre-push.py | 15 ++------------- llvm/utils/gn/gn.py | 2 +- 10 files changed, 25 insertions(+), 45 deletions(-) diff --git a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py index 48401ba5ea42a9..b702eece37002b 100755 --- a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py +++ b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py @@ -511,12 +511,10 @@ async def main() -> None: ) invocation.append("-list-checks") invocation.append("-") - if args.quiet: - # Even with -quiet we still want to check if we can call clang-tidy. - with open(os.devnull, "w") as dev_null: - subprocess.check_call(invocation, stdout=dev_null) - else: - subprocess.check_call(invocation) + # Even with -quiet we still want to check if we can call clang-tidy. 
+ subprocess.check_call( + invocation, stdout=subprocess.DEVNULL if args.quiet else None + ) except: print("Unable to run clang-tidy.", file=sys.stderr) sys.exit(1) diff --git a/clang/docs/tools/generate_formatted_state.py b/clang/docs/tools/generate_formatted_state.py index 66cebbf7af33a4..2de43dc383f557 100755 --- a/clang/docs/tools/generate_formatted_state.py +++ b/clang/docs/tools/generate_formatted_state.py @@ -78,8 +78,6 @@ def get_style(count, passed): - {style2}`{percent}%` """ -FNULL = open(os.devnull, "w") - with open(DOC_FILE, "wb") as output: cleanfiles = open(CLEAN_FILE, "wb") @@ -101,8 +99,8 @@ def get_style(count, passed): # interested in it, just the return code. git_check = subprocess.Popen( ["git", "ls-files", "--error-unmatch", act_sub_dir], - stdout=FNULL, - stderr=FNULL, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, ) if git_check.wait() != 0: print("Skipping directory: ", act_sub_dir) diff --git a/clang/tools/scan-view/share/startfile.py b/clang/tools/scan-view/share/startfile.py index d63e69280e90dd..c72475e8b6212e 100644 --- a/clang/tools/scan-view/share/startfile.py +++ b/clang/tools/scan-view/share/startfile.py @@ -48,7 +48,7 @@ def _invoke(self, cmdline): or sys.platform[:3] == "win" or sys.platform == "darwin" ): - inout = file(os.devnull, "r+") + inout = subprocess.DEVNULL else: # for TTY programs, we need stdin/out inout = None diff --git a/clang/utils/creduce-clang-crash.py b/clang/utils/creduce-clang-crash.py index db4a3435a3aef7..180dfbeab224e9 100755 --- a/clang/utils/creduce-clang-crash.py +++ b/clang/utils/creduce-clang-crash.py @@ -8,7 +8,6 @@ *.test.sh -- interestingness test for C-Reduce """ -from __future__ import print_function from argparse import ArgumentParser, RawTextHelpFormatter import os import re @@ -228,8 +227,7 @@ def check_interestingness(self): testfile = os.path.abspath(self.testfile) # Check that the test considers the original file interesting - with open(os.devnull, "w") as devnull: - returncode = subprocess.call(testfile, stdout=devnull) + returncode = subprocess.call(testfile, stdout=subprocess.DEVNULL) if returncode: sys.exit("The interestingness test does not pass for the original file.") diff --git a/lldb/bindings/interface/SBErrorDocstrings.i b/lldb/bindings/interface/SBErrorDocstrings.i index b64c3d64c6c77b..c272ffb7605ffb 100644 --- a/lldb/bindings/interface/SBErrorDocstrings.i +++ b/lldb/bindings/interface/SBErrorDocstrings.i @@ -10,8 +10,10 @@ For example (from test/python_api/hello_world/TestHelloWorld.py), :: # Spawn a new process and don't display the stdout if not in TraceOn() mode. import subprocess - popen = subprocess.Popen([self.exe, 'abc', 'xyz'], - stdout = open(os.devnull, 'w') if not self.TraceOn() else None) + popen = subprocess.Popen( + [self.exe, 'abc', 'xyz'], + stdout=subprocess.DEVNULL if not self.TraceOn() else None, + ) listener = lldb.SBListener('my.attach.listener') error = lldb.SBError() diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py index 0e8ca159efd55d..834f01aaa61e6b 100644 --- a/lldb/packages/Python/lldbsuite/test/decorators.py +++ b/lldb/packages/Python/lldbsuite/test/decorators.py @@ -467,9 +467,8 @@ def should_skip_simulator_test(): if lldbplatformutil.getHostPlatform() not in ["darwin", "macosx"]: return "simulator tests are run only on darwin hosts." 
try: - DEVNULL = open(os.devnull, "w") output = subprocess.check_output( - ["xcodebuild", "-showsdks"], stderr=DEVNULL + ["xcodebuild", "-showsdks"], stderr=subprocess.DEVNULL ).decode("utf-8") if re.search("%ssimulator" % platform, output): return None @@ -1094,9 +1093,8 @@ def skipUnlessFeature(feature): def is_feature_enabled(): if platform.system() == "Darwin": try: - DEVNULL = open(os.devnull, "w") output = subprocess.check_output( - ["/usr/sbin/sysctl", feature], stderr=DEVNULL + ["/usr/sbin/sysctl", feature], stderr=subprocess.DEVNULL ).decode("utf-8") # If 'feature: 1' was output, then this feature is available and # the test should not be skipped. diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index b57c3bdd87c83c..e0da7cbd1ddd6e 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -31,7 +31,6 @@ import abc from functools import wraps import gc -import glob import io import json import os.path @@ -416,7 +415,7 @@ def launch(self, executable, args, extra_env): self._proc = Popen( [executable] + args, - stdout=open(os.devnull) if not self._trace_on else None, + stdout=DEVNULL if not self._trace_on else None, stdin=PIPE, env=env, ) diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index c5e4ad4219c91d..9b9be69ee38448 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -1,11 +1,8 @@ -from __future__ import print_function - import argparse import bisect import collections import copy import glob -import itertools import os import re import subprocess @@ -517,12 +514,13 @@ def invoke_tool(exe, cmd_args, ir, preprocess_cmd=None, verbose=False): sep="", file=sys.stderr, ) - # Python 2.7 doesn't have subprocess.DEVNULL: - with open(os.devnull, "w") as devnull: - pp = subprocess.Popen( - preprocess_cmd, shell=True, stdin=devnull, stdout=subprocess.PIPE - ) - ir_file = pp.stdout + pp = subprocess.Popen( + preprocess_cmd, + shell=True, + stdin=subprocess.DEVNULL, + stdout=subprocess.PIPE, + ) + ir_file = pp.stdout if isinstance(cmd_args, list): args = [applySubstitutions(a, substitutions) for a in cmd_args] diff --git a/llvm/utils/git/pre-push.py b/llvm/utils/git/pre-push.py index d7ae3767d2923d..dfa009dd1a6f62 100755 --- a/llvm/utils/git/pre-push.py +++ b/llvm/utils/git/pre-push.py @@ -27,7 +27,6 @@ """ import argparse -import os import shutil import subprocess import sys @@ -70,14 +69,6 @@ def ask_confirm(prompt): return query.lower() == "y" -def get_dev_null(): - """Lazily create a /dev/null fd for use in shell()""" - global dev_null_fd - if dev_null_fd is None: - dev_null_fd = open(os.devnull, "w") - return dev_null_fd - - def shell( cmd, strip=True, @@ -95,10 +86,8 @@ def shell( cwd_msg = " in %s" % cwd log_verbose("Running%s: %s" % (cwd_msg, " ".join(quoted_cmd))) - err_pipe = subprocess.PIPE - if ignore_errors: - # Silence errors if requested. - err_pipe = get_dev_null() + # Silence errors if requested. 
+ err_pipe = subprocess.DEVNULL if ignore_errors else subprocess.PIPE

 start = time.time()
 p = subprocess.Popen(
diff --git a/llvm/utils/gn/gn.py b/llvm/utils/gn/gn.py
index 290c6941bceea2..6b7919b7faeb92 100755
--- a/llvm/utils/gn/gn.py
+++ b/llvm/utils/gn/gn.py
@@ -42,7 +42,7 @@ def main():
 if (
 subprocess.call(
 "gn --version",
- stdout=open(os.devnull, "w"),
+ stdout=subprocess.DEVNULL,
 stderr=subprocess.STDOUT,
 shell=True,
 )
From 079746d2c0804ddf616766eb525270d9c57ab542 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Fri, 30 Aug 2024 14:27:51 -0400
Subject: [PATCH 82/98] [SLP] Better cost estimation for masked gather or
 "clustered" loads.

After landing support for actual vectorization of "clustered" loads, we
need a better estimate of the cost of masked gather versus clustered
loads. This includes estimating the cost of the address calculation and
a better estimate for the gathered loads themselves. Also, the
estimation now relies on the SLPCostThreshold option, allowing the
compiler's behavior to be tuned.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/105858

---
 .../Transforms/Vectorize/SLPVectorizer.cpp | 225 ++++++++++++------
 .../SLPVectorizer/X86/pr47629-inseltpoison.ll | 20 +-
 .../Transforms/SLPVectorizer/X86/pr47629.ll | 20 +-
 ...masked-loads-consecutive-loads-same-ptr.ll | 20 +-
 .../X86/reorder-possible-strided-node.ll | 72 ++++--
 .../X86/reorder-reused-masked-gather2.ll | 26 +-
 6 files changed, 257 insertions(+), 126 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f6a797b071b65c..8f5cbcaa8f66e3 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4820,16 +4820,68 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
 }
 }
 }
- auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
+ // Correctly identify compare the cost of loads + shuffles rather than
+ // strided/masked gather loads. Returns true if vectorized + shuffles
+ // representation is better than just gather.
+ auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
+ bool ProfitableGatherPointers) {
+ // Compare masked gather cost and loads + insert subvector costs.
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ auto [ScalarGEPCost, VectorGEPCost] =
+ getGEPCosts(TTI, PointerOps, PointerOps.front(),
+ Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
+ // Estimate the cost of masked gather GEP. If not a splat, roughly
+ // estimate as a buildvector, otherwise estimate as splat.
+ APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
+ VectorType *PtrVecTy =
+ getWidenedType(PointerOps.front()->getType()->getScalarType(),
+ VecTy->getNumElements());
+ if (static_cast<unsigned>(count_if(
+ PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
+ any_of(PointerOps, [&](Value *V) {
+ return getUnderlyingObject(V) !=
+ getUnderlyingObject(PointerOps.front());
+ }))
+ VectorGEPCost += TTI.getScalarizationOverhead(
+ PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
+ else
+ VectorGEPCost +=
+ TTI.getScalarizationOverhead(
+ PtrVecTy, APInt::getOneBitSet(VecTy->getNumElements(), 0),
+ /*Insert=*/true, /*Extract=*/false, CostKind) +
+ ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, std::nullopt,
+ CostKind);
+ // The cost of scalar loads.
+ InstructionCost ScalarLoadsCost = + std::accumulate(VL.begin(), VL.end(), InstructionCost(), + [&](InstructionCost C, Value *V) { + return C + TTI.getInstructionCost( + cast(V), CostKind); + }) + + ScalarGEPCost; + // The cost of masked gather. + InstructionCost MaskedGatherCost = + TTI.getGatherScatterOpCost( + Instruction::Load, VecTy, cast(VL0)->getPointerOperand(), + /*VariableMask=*/false, CommonAlignment, CostKind) + + (ProfitableGatherPointers ? 0 : VectorGEPCost); + InstructionCost GatherCost = + TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true, + /*Extract=*/false, CostKind) + + ScalarLoadsCost; + // The list of loads is small or perform partial check already - directly + // compare masked gather cost and gather cost. + constexpr unsigned ListLimit = 4; + if (!TryRecursiveCheck || VL.size() < ListLimit) + return MaskedGatherCost - GatherCost >= -SLPCostThreshold; unsigned Sz = DL->getTypeSizeInBits(ScalarTy); - unsigned MinVF = getMinVF(Sz); - unsigned MaxVF = std::max(bit_floor(VL.size() / 2), MinVF); - MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF); - for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) { - unsigned VectorizedCnt = 0; + unsigned MinVF = getMinVF(2 * Sz); + DemandedElts.clearAllBits(); + // Iterate through possible vectorization factors and check if vectorized + + // shuffles is better than just gather. + for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) { SmallVector States; - for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; - Cnt += VF, ++VectorizedCnt) { + for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) { ArrayRef Slice = VL.slice(Cnt, VF); SmallVector Order; SmallVector PointerOps; @@ -4837,8 +4889,10 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, /*TryRecursiveCheck=*/false); // Check that the sorted loads are consecutive. - if (LS == LoadsState::Gather) - break; + if (LS == LoadsState::Gather) { + DemandedElts.setBits(Cnt, Cnt + VF); + continue; + } // If need the reorder - consider as high-cost masked gather for now. if ((LS == LoadsState::Vectorize || LS == LoadsState::StridedVectorize) && @@ -4846,79 +4900,93 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( LS = LoadsState::ScatterVectorize; States.push_back(LS); } + if (DemandedElts.isAllOnes()) + // All loads gathered - try smaller VF. + continue; + InstructionCost ScalarVFGEPCost = 0; // Can be vectorized later as a serie of loads/insertelements. - if (VectorizedCnt == VL.size() / VF) { - // Compare masked gather cost and loads + insersubvector costs. 
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - auto [ScalarGEPCost, VectorGEPCost] = - getGEPCosts(TTI, PointerOps, PointerOps.front(), - Instruction::GetElementPtr, CostKind, ScalarTy, VecTy); - InstructionCost MaskedGatherCost = - TTI.getGatherScatterOpCost(Instruction::Load, VecTy, - cast(VL0)->getPointerOperand(), - /*VariableMask=*/false, CommonAlignment, - CostKind) + - VectorGEPCost - ScalarGEPCost; - InstructionCost VecLdCost = 0; - auto *SubVecTy = getWidenedType(ScalarTy, VF); - for (auto [I, LS] : enumerate(States)) { - auto *LI0 = cast(VL[I * VF]); - switch (LS) { - case LoadsState::Vectorize: { - auto [ScalarGEPCost, VectorGEPCost] = - getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF), - LI0->getPointerOperand(), Instruction::Load, - CostKind, ScalarTy, SubVecTy); - VecLdCost += TTI.getMemoryOpCost( - Instruction::Load, SubVecTy, LI0->getAlign(), - LI0->getPointerAddressSpace(), CostKind, - TTI::OperandValueInfo()) + - VectorGEPCost - ScalarGEPCost; - break; - } - case LoadsState::StridedVectorize: { - auto [ScalarGEPCost, VectorGEPCost] = - getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF), - LI0->getPointerOperand(), Instruction::Load, - CostKind, ScalarTy, SubVecTy); - VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy, - LI0->getPointerOperand(), - /*VariableMask=*/false, - CommonAlignment, CostKind) + - VectorGEPCost - ScalarGEPCost; - break; - } - case LoadsState::ScatterVectorize: { - auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts( - TTI, ArrayRef(PointerOps).slice(I * VF, VF), - LI0->getPointerOperand(), Instruction::GetElementPtr, CostKind, - ScalarTy, SubVecTy); - VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy, - LI0->getPointerOperand(), - /*VariableMask=*/false, - CommonAlignment, CostKind) + - VectorGEPCost - ScalarGEPCost; - break; - } - case LoadsState::Gather: - llvm_unreachable( - "Expected only consecutive, strided or masked gather loads."); - } - SmallVector ShuffleMask(VL.size()); - for (int Idx : seq(0, VL.size())) - ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx; + InstructionCost VecLdCost = 0; + if (!DemandedElts.isZero()) { + VecLdCost = + TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true, + /*Extract=*/false, CostKind) + + ScalarGEPCost; + for (unsigned Idx : seq(VL.size())) + if (DemandedElts[Idx]) + VecLdCost += + TTI.getInstructionCost(cast(VL[Idx]), CostKind); + } + auto *SubVecTy = getWidenedType(ScalarTy, VF); + for (auto [I, LS] : enumerate(States)) { + auto *LI0 = cast(VL[I * VF]); + InstructionCost VectorGEPCost = + (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers) + ? 
0 + : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF), + LI0->getPointerOperand(), + Instruction::GetElementPtr, CostKind, ScalarTy, + SubVecTy) + .second; + if (LS == LoadsState::ScatterVectorize) { + if (static_cast( + count_if(PointerOps, IsaPred)) < + PointerOps.size() - 1 || + any_of(PointerOps, [&](Value *V) { + return getUnderlyingObject(V) != + getUnderlyingObject(PointerOps.front()); + })) + VectorGEPCost += TTI.getScalarizationOverhead( + SubVecTy, APInt::getAllOnes(VF), + /*Insert=*/true, /*Extract=*/false, CostKind); + else + VectorGEPCost += TTI.getScalarizationOverhead( + SubVecTy, APInt::getOneBitSet(VF, 0), + /*Insert=*/true, /*Extract=*/false, CostKind) + + ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, + std::nullopt, CostKind); + } + switch (LS) { + case LoadsState::Vectorize: + VecLdCost += + TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(), + LI0->getPointerAddressSpace(), CostKind, + TTI::OperandValueInfo()) + + VectorGEPCost; + break; + case LoadsState::StridedVectorize: + VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy, + LI0->getPointerOperand(), + /*VariableMask=*/false, + CommonAlignment, CostKind) + + VectorGEPCost; + break; + case LoadsState::ScatterVectorize: + VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy, + LI0->getPointerOperand(), + /*VariableMask=*/false, + CommonAlignment, CostKind) + + VectorGEPCost; + break; + case LoadsState::Gather: + // Gathers are already calculated - ignore. + continue; + } + SmallVector ShuffleMask(VL.size()); + for (int Idx : seq(0, VL.size())) + ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx; + if (I > 0) VecLdCost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask, CostKind, I * VF, SubVecTy); - } - // If masked gather cost is higher - better to vectorize, so - // consider it as a gather node. It will be better estimated - // later. - if (MaskedGatherCost >= VecLdCost) - return true; } + // If masked gather cost is higher - better to vectorize, so + // consider it as a gather node. It will be better estimated + // later. + if (MaskedGatherCost >= VecLdCost && + VecLdCost - GatherCost < -SLPCostThreshold) + return true; } - return false; + return MaskedGatherCost - GatherCost >= -SLPCostThreshold; }; // TODO: need to improve analysis of the pointers, if not all of them are // GEPs or have > 2 operands, we end up with a gather node, which just @@ -4939,7 +5007,8 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) { // Check if potential masked gather can be represented as series // of loads + insertsubvectors. - if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) { + if (TryRecursiveCheck && + CheckForShuffledLoads(CommonAlignment, ProfitableGatherPointers)) { // If masked gather cost is higher - better to vectorize, so // consider it as a gather node. It will be better estimated // later. 
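To make the new decision rule concrete before the test updates below, here is a minimal standalone C++ sketch of the threshold comparison that CheckForShuffledLoads now boils down to. The struct and function names here are illustrative stand-ins, not the SLP vectorizer's API, and the costs are simplified to plain integers:

  #include <cstdint>

  // Illustrative cost bundle; in SLP these values come from
  // TargetTransformInfo queries (gather/scatter, memory op and shuffle costs).
  struct Costs {
    int64_t MaskedGather;      // one wide masked gather plus its GEP vector cost
    int64_t LoadsPlusShuffles; // smaller vector loads plus insertsubvector shuffles
  };

  // Mirrors the shape of "MaskedGatherCost - GatherCost >= -SLPCostThreshold":
  // keep the loads-plus-shuffles form unless the masked gather wins by more
  // than the threshold's slack.
  bool preferLoadsPlusShuffles(const Costs &C, int64_t SLPCostThreshold) {
    return C.MaskedGather - C.LoadsPlusShuffles >= -SLPCostThreshold;
  }

The -SLPCostThreshold slack is what lets users bias the compiler toward or away from masked gathers, as the commit message notes.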
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index 5b33c6e889363e..89bc44dc1d530a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -180,12 +180,20 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr> [[TMP3]], <4 x ptr> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x ptr> [[TMP4]], <4 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP5]], i32 4, <4 x i1> , <4 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], -; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4 +; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], +; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index 09d6c77557efaa..c1b501015e81e4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -180,12 +180,20 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr> [[TMP3]], <4 x ptr> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x ptr> [[TMP4]], <4 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP5]], i32 4, <4 x i1> , <4 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], -; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4 +; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr 
[[TMP1]], i64 40 +; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], +; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll index 40dcc79f79ffce..09a5ace101e645 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll @@ -8,19 +8,23 @@ ; YAML-NEXT: Function: test ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-5' +; YAML-NEXT: - Cost: '-7' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '7' +; YAML-NEXT: - TreeSize: '5' define void @test(ptr noalias %p, ptr noalias %p1) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[P:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x ptr> [[TMP1]], <4 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> , <4 x i32> poison) -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[P]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[I:%.*]] = load i32, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr i32, ptr [[P]], i64 32 +; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr i32, ptr [[P]], i64 33 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[I]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[I2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP3]], <2 x i32> [[TMP0]], i64 2) +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP1]] ; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[P1:%.*]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll index cfbbe14186b501..eacfbda5447c7b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll 
@@ -5,13 +5,23 @@ define void @test() { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i32, ptr null, i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr null, align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> ), i32 4, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP8]] ; CHECK-NEXT: [[TMP4:%.*]] = ashr <4 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP11]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 ; CHECK-NEXT: ret void ; entry: @@ -57,15 +67,25 @@ define void @test1() { ; CHECK-LABEL: define void @test1( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i32, ptr null, i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr null, align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> ), i32 4, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP8]] ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[TMP3]] to <4 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP4]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] 
= trunc <4 x i64> [[TMP5]] to <4 x i32> -; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP13]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 ; CHECK-NEXT: ret void ; entry: @@ -111,12 +131,22 @@ define void @test_div() { ; CHECK-LABEL: define void @test_div( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i32, ptr null, i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr null, align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> ), i32 4, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = udiv <4 x i32> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = udiv <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 ; CHECK-NEXT: ret void ; @@ -163,12 +193,22 @@ define void @test_rem() { ; CHECK-LABEL: define void @test_rem( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i32, ptr null, i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr null, align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> ), i32 4, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = urem <4 x i32> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: 
[[TMP7:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = urem <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll index 30f328293cdaa3..c114c5dee78e99 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll @@ -6,18 +6,20 @@ target triple = "x86_64-unknown-linux-gnu" define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 { ; CHECK-LABEL: @foo( -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) [[TMP0:%.*]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr addrspace(1)> [[TMP3]], <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, <4 x ptr addrspace(1)> [[TMP4]], <4 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 8 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8 -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p1(<4 x ptr addrspace(1)> [[TMP5]], i32 4, <4 x i1> , <4 x float> poison) -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP7]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd <8 x float> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: store <8 x float> [[TMP13]], ptr addrspace(1) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0:%.*]], i64 8 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 24 +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP3]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP5]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP4]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP6]], i64 0) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP10]], <2 x float> [[TMP8]], i64 2) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = fmul <8 x float> [[TMP12]], [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x float> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> poison, <8 x i32> +; CHECK-NEXT: store <8 x float> [[TMP15]], ptr addrspace(1) [[TMP3]], align 4 ; CHECK-NEXT: ret void ; %3 = getelementptr inbounds i8, ptr 
addrspace(1) %0, i64 8 From 8a267b721180b172e329601039a7e170fa8aa5b5 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 30 Aug 2024 11:44:29 -0700 Subject: [PATCH 83/98] [SLP][NFC]Remove unused variable --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 8f5cbcaa8f66e3..848547c6ef3663 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4903,7 +4903,6 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( if (DemandedElts.isAllOnes()) // All loads gathered - try smaller VF. continue; - InstructionCost ScalarVFGEPCost = 0; // Can be vectorized later as a serie of loads/insertelements. InstructionCost VecLdCost = 0; if (!DemandedElts.isZero()) { From 688a27496d73881a9e793a61f3f3a879f7efd581 Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Fri, 30 Aug 2024 11:47:34 -0700 Subject: [PATCH 84/98] [PtrUseVisitor] Allow using Argument as a starting point (#106308) Argument is another possible starting point for the pointer traversal, and PtrUseVisitor should be able to handle it. --- llvm/include/llvm/Analysis/PtrUseVisitor.h | 7 +++++-- llvm/lib/Analysis/PtrUseVisitor.cpp | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/Analysis/PtrUseVisitor.h b/llvm/include/llvm/Analysis/PtrUseVisitor.h index f5c23b1b4e014d..237d328721609b 100644 --- a/llvm/include/llvm/Analysis/PtrUseVisitor.h +++ b/llvm/include/llvm/Analysis/PtrUseVisitor.h @@ -157,7 +157,7 @@ class PtrUseVisitorBase { /// /// This will visit the users with the same offset of the current visit /// (including an unknown offset if that is the current state). - void enqueueUsers(Instruction &I); + void enqueueUsers(Value &I); /// Walk the operands of a GEP and adjust the offset as appropriate. /// @@ -208,11 +208,14 @@ class PtrUseVisitor : protected InstVisitor, /// Recursively visit the uses of the given pointer. /// \returns An info struct about the pointer. See \c PtrInfo for details. - PtrInfo visitPtr(Instruction &I) { + /// We may also need to process Argument pointers, so the input uses is + /// a common Value type. + PtrInfo visitPtr(Value &I) { // This must be a pointer type. Get an integer type suitable to hold // offsets on this pointer. // FIXME: Support a vector of pointers. assert(I.getType()->isPointerTy()); + assert(isa(I) || isa(I)); IntegerType *IntIdxTy = cast(DL.getIndexType(I.getType())); IsOffsetKnown = true; Offset = APInt(IntIdxTy->getBitWidth(), 0); diff --git a/llvm/lib/Analysis/PtrUseVisitor.cpp b/llvm/lib/Analysis/PtrUseVisitor.cpp index 49304818d7efed..9c79546f491eff 100644 --- a/llvm/lib/Analysis/PtrUseVisitor.cpp +++ b/llvm/lib/Analysis/PtrUseVisitor.cpp @@ -17,7 +17,7 @@ using namespace llvm; -void detail::PtrUseVisitorBase::enqueueUsers(Instruction &I) { +void detail::PtrUseVisitorBase::enqueueUsers(Value &I) { for (Use &U : I.uses()) { if (VisitedUses.insert(&U).second) { UseToVisit NewU = { From 6ab07d71174982e5cb95420ee4df01347333c342 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 30 Aug 2024 14:50:34 -0400 Subject: [PATCH 85/98] [SLP]Initial support for non-power-of-2 (but still whole register) number of elements in operands. Patch adds basic support for non-power-of-2 number of elements in operands. The patch still requires that this number addresses whole registers. 
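As a rough illustration of the whole-register requirement, the following standalone sketch mimics the new getFullVectorNumberOfElements helper; NumParts stands in for what the patch obtains from TTI.getRegUsageForType, and these are simplified stand-ins rather than LLVM's actual utilities:

  #include <cstdint>

  static uint64_t powerOf2Ceil(uint64_t X) {
    uint64_t P = 1;
    while (P < X)
      P <<= 1;
    return P;
  }

  static uint64_t divideCeil(uint64_t N, uint64_t D) { return (N + D - 1) / D; }

  // Round Sz up to an element count that splits into NumParts whole
  // registers, with a power-of-two number of elements per register.
  uint64_t fullVectorNumberOfElements(uint64_t Sz, uint64_t NumParts) {
    if (NumParts == 0 || NumParts == Sz)
      return powerOf2Ceil(Sz);
    return powerOf2Ceil(divideCeil(Sz, NumParts)) * NumParts;
  }

For example, Sz = 6 with NumParts = 3 stays at 6 (three whole 2-element registers), consistent with the <6 x i64> load that appears in the RISC-V test below.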
Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/106449 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 106 ++++++++++++------ .../RISCV/reduction-whole-regs-loads.ll | 14 +-- 2 files changed, 78 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 848547c6ef3663..3d41c978281351 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -260,6 +260,20 @@ static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) { VF * getNumElements(ScalarTy)); } +/// Returns the number of elements of the given type \p Ty, not less than \p Sz, +/// which forms type, which splits by \p TTI into whole vector types during +/// legalization. +static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, + Type *Ty, unsigned Sz) { + if (!isValidElementType(Ty)) + return PowerOf2Ceil(Sz); + // Find the number of elements, which forms full vectors. + const unsigned NumParts = TTI.getRegUsageForType(getWidenedType(Ty, Sz)); + if (NumParts == 0 || NumParts == Sz) + return PowerOf2Ceil(Sz); + return PowerOf2Ceil(divideCeil(Sz, NumParts)) * NumParts; +} + static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl &Mask) { // The ShuffleBuilder implementation use shufflevector to splat an "element". @@ -1224,6 +1238,22 @@ static bool doesNotNeedToSchedule(ArrayRef VL) { (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts)); } +/// Returns true if widened type of \p Ty elements with size \p Sz represents +/// full vector type, i.e. adding extra element results in extra parts upon type +/// legalization. +static bool hasFullVectorsOnly(const TargetTransformInfo &TTI, Type *Ty, + unsigned Sz) { + if (Sz <= 1) + return false; + if (!isValidElementType(Ty) && !isa(Ty)) + return false; + if (has_single_bit(Sz)) + return true; + const unsigned NumParts = TTI.getRegUsageForType(getWidenedType(Ty, Sz)); + return NumParts > 0 && NumParts != Sz && has_single_bit(Sz / NumParts) && + Sz % NumParts == 0; +} + namespace slpvectorizer { /// Bottom Up SLP Vectorizer. @@ -2467,7 +2497,9 @@ class BoUpSLP { } // TODO: Check if we can remove a check for non-power-2 number of // scalars after full support of non-power-2 vectorization. - return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size()); + return UniqueValues.size() != 2 && + hasFullVectorsOnly(*R.TTI, (*UniqueValues.begin())->getType(), + UniqueValues.size()); }; // If the initial strategy fails for any of the operand indexes, then we @@ -3276,8 +3308,9 @@ class BoUpSLP { SmallVectorImpl *AltScalars = nullptr) const; /// Return true if this is a non-power-of-2 node. 
- bool isNonPowOf2Vec() const { - bool IsNonPowerOf2 = !has_single_bit(Scalars.size()); + bool isNonPowOf2Vec(const TargetTransformInfo &TTI) const { + bool IsNonPowerOf2 = !hasFullVectorsOnly( + TTI, getValueType(Scalars.front()), Scalars.size()); assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) && "Reshuffling not supported with non-power-of-2 vectors yet."); return IsNonPowerOf2; @@ -3455,7 +3488,7 @@ class BoUpSLP { if (UserTreeIdx.UserTE) { Last->UserTreeIndices.push_back(UserTreeIdx); - assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) && + assert((!Last->isNonPowOf2Vec(*TTI) || Last->ReorderIndices.empty()) && "Reordering isn't implemented for non-power-of-2 nodes yet"); } return Last; @@ -4361,7 +4394,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { if (!isValidElementType(ScalarTy)) return std::nullopt; auto *VecTy = getWidenedType(ScalarTy, NumScalars); - int NumParts = TTI->getNumberOfParts(VecTy); + int NumParts = TTI->getRegUsageForType(VecTy); if (NumParts == 0 || NumParts >= NumScalars) NumParts = 1; SmallVector ExtractMask; @@ -4733,7 +4766,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( // Check the order of pointer operands or that all pointers are the same. bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order); // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. - if (!Order.empty() && !has_single_bit(VL.size())) { + if (!Order.empty() && !hasFullVectorsOnly(*TTI, ScalarTy, Sz)) { assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only " "supported with VectorizeNonPowerOf2"); return LoadsState::Gather; @@ -4787,12 +4820,13 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( }); }); const unsigned AbsoluteDiff = std::abs(*Diff); - if (IsPossibleStrided && (IsAnyPointerUsedOutGraph || - ((Sz > MinProfitableStridedLoads || - (AbsoluteDiff <= MaxProfitableLoadStride * Sz && - has_single_bit(AbsoluteDiff))) && - AbsoluteDiff > Sz) || - *Diff == -(static_cast(Sz) - 1))) { + if (IsPossibleStrided && + (IsAnyPointerUsedOutGraph || + ((Sz > MinProfitableStridedLoads || + (AbsoluteDiff <= MaxProfitableLoadStride * Sz && + hasFullVectorsOnly(*TTI, ScalarTy, AbsoluteDiff))) && + AbsoluteDiff > Sz) || + *Diff == -(static_cast(Sz) - 1))) { int Stride = *Diff / static_cast(Sz - 1); if (*Diff == Stride * static_cast(Sz - 1)) { Align Alignment = @@ -5197,7 +5231,7 @@ static bool areTwoInsertFromSameBuildVector( std::optional BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // FIXME: Vectorizing is not supported yet for non-power-of-2 ops. - if (TE.isNonPowOf2Vec()) + if (TE.isNonPowOf2Vec(*TTI)) return std::nullopt; // No need to reorder if need to shuffle reuses, still need to shuffle the @@ -5231,8 +5265,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { } } if (Sz == 2 && TE.getVectorFactor() == 4 && - TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(), - 2 * TE.getVectorFactor())) == 1) + TTI->getRegUsageForType(getWidenedType(TE.Scalars.front()->getType(), + 2 * TE.getVectorFactor())) == 1) return std::nullopt; if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices, Sz)) { @@ -5581,7 +5615,7 @@ void BoUpSLP::reorderTopToBottom() { // Reorder the graph nodes according to their vectorization factor. 
for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1; - VF /= 2) { + VF -= 2) { auto It = VFToOrderedEntries.find(VF); if (It == VFToOrderedEntries.end()) continue; @@ -5754,7 +5788,7 @@ bool BoUpSLP::canReorderOperands( ArrayRef ReorderableGathers, SmallVectorImpl &GatherOps) { // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. - if (UserTE->isNonPowOf2Vec()) + if (UserTE->isNonPowOf2Vec(*TTI)) return false; for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) { @@ -5929,7 +5963,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0)); const auto AllowsReordering = [&](const TreeEntry *TE) { // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. - if (TE->isNonPowOf2Vec()) + if (TE->isNonPowOf2Vec(*TTI)) return false; if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() || (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) || @@ -6575,7 +6609,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( case Instruction::ExtractElement: { bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); // FIXME: Vectorizing is not supported yet for non-power-of-2 ops. - if (!has_single_bit(VL.size())) + if (!hasFullVectorsOnly(*TTI, VL0->getType(), VL.size())) return TreeEntry::NeedToGather; if (Reuse || !CurrentOrder.empty()) return TreeEntry::Vectorize; @@ -6985,7 +7019,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, ReuseShuffleIndices.clear(); } else { // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops. - if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) { + if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec(*TTI)) { LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported " "for nodes with padding.\n"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); @@ -6998,7 +7032,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, return isa(V) || !isConstant(V); })) || - !llvm::has_single_bit(NumUniqueScalarValues)) { + !hasFullVectorsOnly(*TTI, UniqueValues.front()->getType(), + NumUniqueScalarValues)) { if (DoNotFail && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() && all_of(UniqueValues, [=](Value *V) { @@ -7006,7 +7041,9 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, areAllUsersVectorized(cast(V), UserIgnoreList); })) { - unsigned PWSz = PowerOf2Ceil(UniqueValues.size()); + // Find the number of elements, which forms full vectors. 
+ unsigned PWSz = getFullVectorNumberOfElements( + *TTI, UniqueValues.front()->getType(), UniqueValues.size()); if (PWSz == VL.size()) { ReuseShuffleIndices.clear(); } else { @@ -9217,7 +9254,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } assert(!CommonMask.empty() && "Expected non-empty common mask."); auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size()); - unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); + unsigned NumParts = TTI.getRegUsageForType(MaskVecTy); if (NumParts == 0 || NumParts >= Mask.size()) NumParts = 1; unsigned SliceSize = getPartNumElems(Mask.size(), NumParts); @@ -9234,7 +9271,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } assert(!CommonMask.empty() && "Expected non-empty common mask."); auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size()); - unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); + unsigned NumParts = TTI.getRegUsageForType(MaskVecTy); if (NumParts == 0 || NumParts >= Mask.size()) NumParts = 1; unsigned SliceSize = getPartNumElems(Mask.size(), NumParts); @@ -9740,7 +9777,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, unsigned const NumElts = SrcVecTy->getNumElements(); unsigned const NumScalars = VL.size(); - unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy); + unsigned NumOfParts = TTI->getRegUsageForType(SrcVecTy); SmallVector InsertMask(NumElts, PoisonMaskElem); unsigned OffsetBeg = *getElementIndex(VL.front()); @@ -10956,7 +10993,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { // Keep original scalar if number of externally used instructions in // the same entry is not power of 2. It may help to do some extra // vectorization for now. - KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount); + KeepScalar = + ScalarUsesCount <= 1 || + !hasFullVectorsOnly(*TTI, EU.Scalar->getType(), ScalarUsesCount); } if (KeepScalar) { ExternalUsesAsOriginalScalar.insert(EU.Scalar); @@ -11649,13 +11688,14 @@ BoUpSLP::isGatherShuffledEntry( if (TE == VectorizableTree.front().get()) return {}; // FIXME: Gathering for non-power-of-2 nodes not implemented yet. - if (TE->isNonPowOf2Vec()) + if (TE->isNonPowOf2Vec(*TTI)) return {}; Mask.assign(VL.size(), PoisonMaskElem); assert(TE->UserTreeIndices.size() == 1 && "Expected only single user of the gather node."); - assert(VL.size() % NumParts == 0 && - "Number of scalars must be divisible by NumParts."); + // Number of scalars must be divisible by NumParts. + if (VL.size() % NumParts != 0) + return {}; unsigned SliceSize = getPartNumElems(VL.size(), NumParts); SmallVector> Res; for (unsigned Part : seq(NumParts)) { @@ -12794,7 +12834,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, SmallVector> Entries; Type *OrigScalarTy = GatheredScalars.front()->getType(); auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size()); - unsigned NumParts = TTI->getNumberOfParts(VecTy); + unsigned NumParts = TTI->getRegUsageForType(VecTy); if (NumParts == 0 || NumParts >= GatheredScalars.size()) NumParts = 1; if (!all_of(GatheredScalars, IsaPred)) { @@ -16040,7 +16080,7 @@ void BoUpSLP::computeMinimumValueSizes() { [&](Value *V) { return AnalyzedMinBWVals.contains(V); })) return 0u; - unsigned NumParts = TTI->getNumberOfParts( + unsigned NumParts = TTI->getRegUsageForType( getWidenedType(TreeRootIT, VF * ScalarTyNumElements)); // The maximum bit width required to represent all the values that can be @@ -16097,7 +16137,7 @@ void BoUpSLP::computeMinimumValueSizes() { // use - ignore it. 
if (NumParts > 1 && NumParts == - TTI->getNumberOfParts(getWidenedType( + TTI->getRegUsageForType(getWidenedType( IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF))) return 0u; @@ -16958,7 +16998,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, for (unsigned I = NextInst; I < MaxInst; ++I) { unsigned ActualVF = std::min(MaxInst - I, VF); - if (!has_single_bit(ActualVF)) + if (!hasFullVectorsOnly(*TTI, ScalarTy, ActualVF)) continue; if (MaxVFOnly && ActualVF < MaxVF) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll index 54dc33dbc0d00b..c9a3158acdda34 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll @@ -4,15 +4,11 @@ define i64 @test(ptr %p) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 4 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[P]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_4]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP2]], <4 x i64> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v2i64(<8 x i64> [[TMP3]], <2 x i64> [[TMP1]], i64 4) -; CHECK-NEXT: [[TMP5:%.*]] = mul <8 x i64> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) -; CHECK-NEXT: ret i64 [[TMP6]] +; CHECK-NEXT: [[TMP0:%.*]] = load <6 x i64>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <6 x i64> [[TMP0]], <6 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i64> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP2]]) +; CHECK-NEXT: ret i64 [[TMP3]] ; entry: %arrayidx.1 = getelementptr inbounds i64, ptr %p, i64 1 From 897b00f3c563dd3f7b8f7263c41eaebb3520ec86 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 30 Aug 2024 11:55:04 -0700 Subject: [PATCH 86/98] Reuse getBinOpIdentity in createAnyOfTargetReduction [nfc] Consolidate the code so that we have one copy of the identity-element reasoning instead of several. Note that we're (deliberately) not passing the FMF flags to the common utility, to preserve behavior in this change.
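To make the consolidated identities concrete, here is a tiny standalone check (plain C++, not the LLVM API) of the values getBinOpIdentity is expected to produce for these cases: -0.0 for fadd, since a +0.0 seed would flip a -0.0 result to +0.0, and 1.0 for fmul:

  #include <cassert>
  #include <cmath>

  int main() {
    // FAdd identity is -0.0 (what ConstantFP::getNegativeZero spelled out):
    // a +0.0 seed would lose the sign of a -0.0 sum.
    assert(std::signbit(-0.0 + -0.0));  // -0.0 preserved with a -0.0 identity
    assert(!std::signbit(0.0 + -0.0));  // a +0.0 seed turns -0.0 into +0.0
    // FMul identity is 1.0 (what ConstantFP::get(Ty, 1.0) spelled out).
    assert(1.0 * 3.5 == 3.5);
    return 0;
  }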
--- llvm/lib/Transforms/Utils/LoopUtils.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index a49d3b0b990bc7..8a8d8afece6cb4 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1210,6 +1210,11 @@ Value *llvm::createAnyOfTargetReduction(IRBuilderBase &Builder, Value *Src, Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src, RecurKind RdxKind) { auto *SrcVecEltTy = cast(Src->getType())->getElementType(); + auto getIdentity = [&]() { + Intrinsic::ID ID = getReductionIntrinsicID(RdxKind); + unsigned Opc = getArithmeticReductionInstruction(ID); + return ConstantExpr::getBinOpIdentity(Opc, SrcVecEltTy); + }; switch (RdxKind) { case RecurKind::Add: case RecurKind::Mul: @@ -1227,10 +1232,9 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src, return Builder.CreateUnaryIntrinsic(getReductionIntrinsicID(RdxKind), Src); case RecurKind::FMulAdd: case RecurKind::FAdd: - return Builder.CreateFAddReduce(ConstantFP::getNegativeZero(SrcVecEltTy), - Src); + return Builder.CreateFAddReduce(getIdentity(), Src); case RecurKind::FMul: - return Builder.CreateFMulReduce(ConstantFP::get(SrcVecEltTy, 1.0), Src); + return Builder.CreateFMulReduce(getIdentity(), Src); default: llvm_unreachable("Unhandled opcode"); } From 5eda4988117021b36ebe01b49082f63365846507 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 30 Aug 2024 12:34:41 -0700 Subject: [PATCH 87/98] Revert "[mlir][Transforms] Dialect conversion: Make materializations optional" (#106778) Reverts llvm/llvm-project#104668 This commit triggers an edge case that can cause circular `unrealized_conversion_cast` ops. https://github.com/llvm/llvm-project/pull/106760 may fix it, but it is has other issues. Reverting this PR for now, until I find a solution for that problem. --- .../mlir/Transforms/DialectConversion.h | 11 - .../Transforms/Utils/DialectConversion.cpp | 393 +++++++++++++----- .../Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir | 5 +- .../Transforms/finalizing-bufferize.mlir | 1 - .../test-legalize-type-conversion.mlir | 6 +- 5 files changed, 298 insertions(+), 118 deletions(-) diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 5f680e8eca7559..60113bdef16a23 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -1124,17 +1124,6 @@ struct ConversionConfig { // already been modified) and iterators into past IR state cannot be // represented at the moment. RewriterBase::Listener *listener = nullptr; - - /// If set to "true", the dialect conversion attempts to build source/target/ - /// argument materializations through the type converter API in lieu of - /// builtin.unrealized_conversion_cast ops. The conversion process fails if - /// at least one materialization could not be built. - /// - /// If set to "false", the dialect conversion does not does not build any - /// custom materializations and instead inserts - /// builtin.unrealized_conversion_cast ops to ensure that the resulting IR - /// is valid. 
- bool buildMaterializations = true; }; //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index cc9c9495e5155c..b23fb97959ed67 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -702,12 +702,14 @@ class UnresolvedMaterializationRewrite : public OperationRewrite { return rewrite->getKind() == Kind::UnresolvedMaterialization; } - void rollback() override; - UnrealizedConversionCastOp getOperation() const { return cast(op); } + void rollback() override; + + void cleanup(RewriterBase &rewriter) override; + /// Return the type converter of this materialization (which may be null). const TypeConverter *getConverter() const { return converterAndKind.getPointer(); @@ -764,7 +766,7 @@ namespace detail { struct ConversionPatternRewriterImpl : public RewriterBase::Listener { explicit ConversionPatternRewriterImpl(MLIRContext *ctx, const ConversionConfig &config) - : context(ctx), eraseRewriter(ctx), config(config) {} + : context(ctx), config(config) {} //===--------------------------------------------------------------------===// // State Management @@ -832,7 +834,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { //===--------------------------------------------------------------------===// // Materializations //===--------------------------------------------------------------------===// - /// Build an unresolved materialization operation given an output type and set /// of input operands. Value buildUnresolvedMaterialization(MaterializationKind kind, @@ -881,7 +882,7 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// Erase the given op (unless it was already erased). void eraseOp(Operation *op) override { - if (wasErased(op)) + if (erased.contains(op)) return; op->dropAllUses(); RewriterBase::eraseOp(op); @@ -889,24 +890,17 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// Erase the given block (unless it was already erased). void eraseBlock(Block *block) override { - if (wasErased(block)) + if (erased.contains(block)) return; assert(block->empty() && "expected empty block"); block->dropAllDefinedValueUses(); RewriterBase::eraseBlock(block); } - bool wasErased(void *ptr) const { return erased.contains(ptr); } - - bool wasErased(OperationRewrite *rewrite) const { - return wasErased(rewrite->getOperation()); - } - void notifyOperationErased(Operation *op) override { erased.insert(op); } void notifyBlockErased(Block *block) override { erased.insert(block); } - private: /// Pointers to all erased operations and blocks. DenseSet erased; }; @@ -918,11 +912,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// MLIR context. MLIRContext *context; - /// A rewriter that keeps track of ops/block that were already erased and - /// skips duplicate op/block erasures. This rewriter is used during the - /// "cleanup" phase. - SingleEraseRewriter eraseRewriter; - // Mapping between replaced values that differ in type. This happens when // replacing a value with one of a different type. ConversionValueMapping mapping; @@ -1069,6 +1058,10 @@ void UnresolvedMaterializationRewrite::rollback() { op->erase(); } +void UnresolvedMaterializationRewrite::cleanup(RewriterBase &rewriter) { + rewriter.eraseOp(op); +} + void ConversionPatternRewriterImpl::applyRewrites() { // Commit all rewrites. 
IRRewriter rewriter(context, config.listener); @@ -1076,6 +1069,7 @@ void ConversionPatternRewriterImpl::applyRewrites() { rewrite->commit(rewriter); // Clean up all rewrites. + SingleEraseRewriter eraseRewriter(context); for (auto &rewrite : rewrites) rewrite->cleanup(eraseRewriter); } @@ -2359,6 +2353,12 @@ struct OperationConverter { ConversionPatternRewriterImpl &rewriterImpl, DenseMap> &inverseMapping); + /// Legalize any unresolved type materializations. + LogicalResult legalizeUnresolvedMaterializations( + ConversionPatternRewriter &rewriter, + ConversionPatternRewriterImpl &rewriterImpl, + DenseMap> &inverseMapping); + /// Legalize an operation result that was marked as "erased". LogicalResult legalizeErasedResult(Operation *op, OpResult result, @@ -2405,56 +2405,6 @@ LogicalResult OperationConverter::convert(ConversionPatternRewriter &rewriter, return success(); } -static LogicalResult -legalizeUnresolvedMaterialization(RewriterBase &rewriter, - UnresolvedMaterializationRewrite *rewrite) { - UnrealizedConversionCastOp op = rewrite->getOperation(); - assert(!op.use_empty() && - "expected that dead materializations have already been DCE'd"); - Operation::operand_range inputOperands = op.getOperands(); - Type outputType = op.getResultTypes()[0]; - - // Try to materialize the conversion. - if (const TypeConverter *converter = rewrite->getConverter()) { - rewriter.setInsertionPoint(op); - Value newMaterialization; - switch (rewrite->getMaterializationKind()) { - case MaterializationKind::Argument: - // Try to materialize an argument conversion. - newMaterialization = converter->materializeArgumentConversion( - rewriter, op->getLoc(), outputType, inputOperands); - if (newMaterialization) - break; - // If an argument materialization failed, fallback to trying a target - // materialization. - [[fallthrough]]; - case MaterializationKind::Target: - newMaterialization = converter->materializeTargetConversion( - rewriter, op->getLoc(), outputType, inputOperands); - break; - case MaterializationKind::Source: - newMaterialization = converter->materializeSourceConversion( - rewriter, op->getLoc(), outputType, inputOperands); - break; - } - if (newMaterialization) { - assert(newMaterialization.getType() == outputType && - "materialization callback produced value of incorrect type"); - rewriter.replaceOp(op, newMaterialization); - return success(); - } - } - - InFlightDiagnostic diag = op->emitError() - << "failed to legalize unresolved materialization " - "from (" - << inputOperands.getTypes() << ") to " << outputType - << " that remained live after conversion"; - diag.attachNote(op->getUsers().begin()->getLoc()) - << "see existing live user here: " << *op->getUsers().begin(); - return failure(); -} - LogicalResult OperationConverter::convertOperations(ArrayRef ops) { if (ops.empty()) return success(); @@ -2496,37 +2446,6 @@ LogicalResult OperationConverter::convertOperations(ArrayRef ops) { } else { rewriterImpl.applyRewrites(); } - - // Gather all unresolved materializations. - SmallVector allCastOps; - DenseMap rewriteMap; - for (std::unique_ptr &rewrite : rewriterImpl.rewrites) { - auto *mat = dyn_cast(rewrite.get()); - if (!mat) - continue; - if (rewriterImpl.eraseRewriter.wasErased(mat)) - continue; - allCastOps.push_back(mat->getOperation()); - rewriteMap[mat->getOperation()] = mat; - } - - // Reconcile all UnrealizedConversionCastOps that were inserted by the - // dialect conversion frameworks. (Not the one that were inserted by - // patterns.) 
- SmallVector remainingCastOps; - reconcileUnrealizedCasts(allCastOps, &remainingCastOps); - - // Try to legalize all unresolved materializations. - if (config.buildMaterializations) { - IRRewriter rewriter(rewriterImpl.context, config.listener); - for (UnrealizedConversionCastOp castOp : remainingCastOps) { - auto it = rewriteMap.find(castOp.getOperation()); - assert(it != rewriteMap.end() && "inconsistent state"); - if (failed(legalizeUnresolvedMaterialization(rewriter, it->second))) - return failure(); - } - } - return success(); } @@ -2540,6 +2459,9 @@ OperationConverter::finalize(ConversionPatternRewriter &rewriter) { if (failed(legalizeConvertedOpResultTypes(rewriter, rewriterImpl, inverseMapping))) return failure(); + if (failed(legalizeUnresolvedMaterializations(rewriter, rewriterImpl, + inverseMapping))) + return failure(); return success(); } @@ -2655,6 +2577,279 @@ LogicalResult OperationConverter::legalizeConvertedArgumentTypes( return success(); } +/// Replace the results of a materialization operation with the given values. +static void +replaceMaterialization(ConversionPatternRewriterImpl &rewriterImpl, + ResultRange matResults, ValueRange values, + DenseMap> &inverseMapping) { + matResults.replaceAllUsesWith(values); + + // For each of the materialization results, update the inverse mappings to + // point to the replacement values. + for (auto [matResult, newValue] : llvm::zip(matResults, values)) { + auto inverseMapIt = inverseMapping.find(matResult); + if (inverseMapIt == inverseMapping.end()) + continue; + + // Update the reverse mapping, or remove the mapping if we couldn't update + // it. Not being able to update signals that the mapping would have become + // circular (i.e. %foo -> newValue -> %foo), which may occur as values are + // propagated through temporary materializations. We simply drop the + // mapping, and let the post-conversion replacement logic handle updating + // uses. + for (Value inverseMapVal : inverseMapIt->second) + if (!rewriterImpl.mapping.tryMap(inverseMapVal, newValue)) + rewriterImpl.mapping.erase(inverseMapVal); + } +} + +/// Compute all of the unresolved materializations that will persist beyond the +/// conversion process, and require inserting a proper user materialization for. +static void computeNecessaryMaterializations( + DenseMap + &materializationOps, + ConversionPatternRewriter &rewriter, + ConversionPatternRewriterImpl &rewriterImpl, + DenseMap> &inverseMapping, + SetVector &necessaryMaterializations) { + // Helper function to check if the given value or a not yet materialized + // replacement of the given value is live. + // Note: `inverseMapping` maps from replaced values to original values. + auto isLive = [&](Value value) { + auto findFn = [&](Operation *user) { + auto matIt = materializationOps.find(user); + if (matIt != materializationOps.end()) + return !necessaryMaterializations.count(matIt->second); + return rewriterImpl.isOpIgnored(user); + }; + // A worklist is needed because a value may have gone through a chain of + // replacements and each of the replaced values may have live users. + SmallVector worklist; + worklist.push_back(value); + while (!worklist.empty()) { + Value next = worklist.pop_back_val(); + if (llvm::find_if_not(next.getUsers(), findFn) != next.user_end()) + return true; + // This value may be replacing another value that has a live user. 
+ llvm::append_range(worklist, inverseMapping.lookup(next)); + } + return false; + }; + + llvm::unique_function lookupRemappedValue = + [&](Value invalidRoot, Value value, Type type) { + // Check to see if the input operation was remapped to a variant of the + // output. + Value remappedValue = rewriterImpl.mapping.lookupOrDefault(value, type); + if (remappedValue.getType() == type && remappedValue != invalidRoot) + return remappedValue; + + // Check to see if the input is a materialization operation that + // provides an inverse conversion. We just check blindly for + // UnrealizedConversionCastOp here, but it has no effect on correctness. + auto inputCastOp = value.getDefiningOp(); + if (inputCastOp && inputCastOp->getNumOperands() == 1) + return lookupRemappedValue(invalidRoot, inputCastOp->getOperand(0), + type); + + return Value(); + }; + + SetVector worklist; + for (auto &rewrite : rewriterImpl.rewrites) { + auto *mat = dyn_cast(rewrite.get()); + if (!mat) + continue; + materializationOps.try_emplace(mat->getOperation(), mat); + worklist.insert(mat); + } + while (!worklist.empty()) { + UnresolvedMaterializationRewrite *mat = worklist.pop_back_val(); + UnrealizedConversionCastOp op = mat->getOperation(); + + // We currently only handle target materializations here. + assert(op->getNumResults() == 1 && "unexpected materialization type"); + OpResult opResult = op->getOpResult(0); + Type outputType = opResult.getType(); + Operation::operand_range inputOperands = op.getOperands(); + + // Try to forward propagate operands for user conversion casts that result + // in the input types of the current cast. + for (Operation *user : llvm::make_early_inc_range(opResult.getUsers())) { + auto castOp = dyn_cast(user); + if (!castOp) + continue; + if (castOp->getResultTypes() == inputOperands.getTypes()) { + replaceMaterialization(rewriterImpl, user->getResults(), inputOperands, + inverseMapping); + necessaryMaterializations.remove(materializationOps.lookup(user)); + } + } + + // Try to avoid materializing a resolved materialization if possible. + // Handle the case of a 1-1 materialization. + if (inputOperands.size() == 1) { + // Check to see if the input operation was remapped to a variant of the + // output. + Value remappedValue = + lookupRemappedValue(opResult, inputOperands[0], outputType); + if (remappedValue && remappedValue != opResult) { + replaceMaterialization(rewriterImpl, opResult, remappedValue, + inverseMapping); + necessaryMaterializations.remove(mat); + continue; + } + } else { + // TODO: Avoid materializing other types of conversions here. + } + + // If the materialization does not have any live users, we don't need to + // generate a user materialization for it. + bool isMaterializationLive = isLive(opResult); + if (!isMaterializationLive) + continue; + if (!necessaryMaterializations.insert(mat)) + continue; + + // Reprocess input materializations to see if they have an updated status. + for (Value input : inputOperands) { + if (auto parentOp = input.getDefiningOp()) { + if (auto *mat = materializationOps.lookup(parentOp)) + worklist.insert(mat); + } + } + } +} + +/// Legalize the given unresolved materialization. Returns success if the +/// materialization was legalized, failure otherise. 
+static LogicalResult legalizeUnresolvedMaterialization(
+    UnresolvedMaterializationRewrite &mat,
+    DenseMap<Operation *, UnresolvedMaterializationRewrite *>
+        &materializationOps,
+    ConversionPatternRewriter &rewriter,
+    ConversionPatternRewriterImpl &rewriterImpl,
+    DenseMap<Value, SmallVector<Value>> &inverseMapping) {
+  auto findLiveUser = [&](auto &&users) {
+    auto liveUserIt = llvm::find_if_not(
+        users, [&](Operation *user) { return rewriterImpl.isOpIgnored(user); });
+    return liveUserIt == users.end() ? nullptr : *liveUserIt;
+  };
+
+  llvm::unique_function<Value(Value, Type)> lookupRemappedValue =
+      [&](Value value, Type type) {
+        // Check to see if the input operation was remapped to a variant of the
+        // output.
+        Value remappedValue = rewriterImpl.mapping.lookupOrDefault(value, type);
+        if (remappedValue.getType() == type)
+          return remappedValue;
+        return Value();
+      };
+
+  UnrealizedConversionCastOp op = mat.getOperation();
+  if (!rewriterImpl.ignoredOps.insert(op))
+    return success();
+
+  // We currently only handle target materializations here.
+  OpResult opResult = op->getOpResult(0);
+  Operation::operand_range inputOperands = op.getOperands();
+  Type outputType = opResult.getType();
+
+  // If any input to this materialization is another materialization, resolve
+  // the input first.
+  for (Value value : op->getOperands()) {
+    auto valueCast = value.getDefiningOp<UnrealizedConversionCastOp>();
+    if (!valueCast)
+      continue;
+
+    auto matIt = materializationOps.find(valueCast);
+    if (matIt != materializationOps.end())
+      if (failed(legalizeUnresolvedMaterialization(
+              *matIt->second, materializationOps, rewriter, rewriterImpl,
+              inverseMapping)))
+        return failure();
+  }
+
+  // Perform a last ditch attempt to avoid materializing a resolved
+  // materialization if possible.
+  // Handle the case of a 1-1 materialization.
+  if (inputOperands.size() == 1) {
+    // Check to see if the input operation was remapped to a variant of the
+    // output.
+    Value remappedValue = lookupRemappedValue(inputOperands[0], outputType);
+    if (remappedValue && remappedValue != opResult) {
+      replaceMaterialization(rewriterImpl, opResult, remappedValue,
+                             inverseMapping);
+      return success();
+    }
+  } else {
+    // TODO: Avoid materializing other types of conversions here.
+  }
+
+  // Try to materialize the conversion.
+  if (const TypeConverter *converter = mat.getConverter()) {
+    rewriter.setInsertionPoint(op);
+    Value newMaterialization;
+    switch (mat.getMaterializationKind()) {
+    case MaterializationKind::Argument:
+      // Try to materialize an argument conversion.
+      newMaterialization = converter->materializeArgumentConversion(
+          rewriter, op->getLoc(), outputType, inputOperands);
+      if (newMaterialization)
+        break;
+      // If an argument materialization failed, fallback to trying a target
+      // materialization.
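+      // (Reaching this point means `newMaterialization` is still null, so
+      // control intentionally falls through to the Target case below.)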
+ [[fallthrough]]; + case MaterializationKind::Target: + newMaterialization = converter->materializeTargetConversion( + rewriter, op->getLoc(), outputType, inputOperands); + break; + case MaterializationKind::Source: + newMaterialization = converter->materializeSourceConversion( + rewriter, op->getLoc(), outputType, inputOperands); + break; + } + if (newMaterialization) { + assert(newMaterialization.getType() == outputType && + "materialization callback produced value of incorrect type"); + replaceMaterialization(rewriterImpl, opResult, newMaterialization, + inverseMapping); + return success(); + } + } + + InFlightDiagnostic diag = op->emitError() + << "failed to legalize unresolved materialization " + "from (" + << inputOperands.getTypes() << ") to " << outputType + << " that remained live after conversion"; + if (Operation *liveUser = findLiveUser(op->getUsers())) { + diag.attachNote(liveUser->getLoc()) + << "see existing live user here: " << *liveUser; + } + return failure(); +} + +LogicalResult OperationConverter::legalizeUnresolvedMaterializations( + ConversionPatternRewriter &rewriter, + ConversionPatternRewriterImpl &rewriterImpl, + DenseMap> &inverseMapping) { + // As an initial step, compute all of the inserted materializations that we + // expect to persist beyond the conversion process. + DenseMap materializationOps; + SetVector necessaryMaterializations; + computeNecessaryMaterializations(materializationOps, rewriter, rewriterImpl, + inverseMapping, necessaryMaterializations); + + // Once computed, legalize any necessary materializations. + for (auto *mat : necessaryMaterializations) { + if (failed(legalizeUnresolvedMaterialization( + *mat, materializationOps, rewriter, rewriterImpl, inverseMapping))) + return failure(); + } + return success(); +} + LogicalResult OperationConverter::legalizeErasedResult( Operation *op, OpResult result, ConversionPatternRewriterImpl &rewriterImpl) { diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index 75362378daaaaa..156a8a468d5b42 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -1286,6 +1286,7 @@ func.func @warpgroup_matrix_multiply_m128n128k64( // CHECK-DAG: %[[S0:.+]] = builtin.unrealized_conversion_cast %[[arg0]] : !nvgpu.warpgroup.descriptor> to i64 // CHECK-DAG: %[[S1:.+]] = builtin.unrealized_conversion_cast %[[arg1]] : !nvgpu.warpgroup.descriptor> to i64 +// CHECK-DAG: %[[S2:.+]] = builtin.unrealized_conversion_cast %[[arg2]] : memref<128x128xf32, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[S3:.+]] = llvm.mlir.constant(0.000000e+00 : f32) : f32 // CHECK: %[[S4:.+]] = llvm.mlir.undef : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> // CHECK: %[[S5:.+]] = llvm.extractvalue %[[S4]][0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, 
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> @@ -1298,8 +1299,8 @@ func.func @warpgroup_matrix_multiply_m128n128k64( // CHECK: %[[S136:.+]] = llvm.insertvalue %[[S134]], %[[S135]][1] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> // CHECK: nvvm.wgmma.fence.aligned // CHECK: %[[S137:.+]] = llvm.mlir.undef : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> -// CHECK: %[[S138:.+]] = llvm.extractvalue %{{.*}}[0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> -// CHECK: %[[S139:.+]] = nvvm.wgmma.mma_async %[[S0]], %[[S1]], %[[S138]], , D[, , ], A[, , ], B[, , ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, 
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> +// CHECK: %[[S138:.+]] = llvm.extractvalue %136[0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> +// CHECK: %[[S139:.+]] = nvvm.wgmma.mma_async %[[S0]], %1, %[[S138]], , D[, , ], A[, , ], B[, , ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> // CHECK: nvvm.wgmma.mma_async // CHECK: nvvm.wgmma.mma_async // CHECK: %[[S154:.+]] = nvvm.wgmma.mma_async diff --git a/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir index ab18ce05e355d3..a192434c5accf8 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir @@ -80,7 +80,6 @@ func.func @no_layout_to_dyn_layout_cast(%m: memref) -> memref // expected-error @+1 {{failed to legalize unresolved materialization from ('memref') to 'memref>' that remained live after conversion}} %1 = bufferization.to_memref %0 : memref> - // expected-note @below{{see existing live user here}} return %1 : memref> } diff --git a/mlir/test/Transforms/test-legalize-type-conversion.mlir b/mlir/test/Transforms/test-legalize-type-conversion.mlir index f130adff42f8cd..cf2c9f6a8ec441 100644 --- a/mlir/test/Transforms/test-legalize-type-conversion.mlir +++ b/mlir/test/Transforms/test-legalize-type-conversion.mlir @@ -4,7 +4,6 @@ func.func @test_invalid_arg_materialization( // expected-error@below {{failed to legalize unresolved materialization from () to 'i16' that remained live after conversion}} %arg0: i16) { - // expected-note@below{{see existing live user here}} "foo.return"(%arg0) : (i16) -> () } @@ -23,7 +22,6 @@ func.func @test_valid_arg_materialization(%arg0: i64) { func.func @test_invalid_result_materialization() { // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}} %result = "test.type_producer"() : () -> f16 - // expected-note@below{{see existing live user here}} "foo.return"(%result) : (f16) -> () } @@ -32,7 +30,6 @@ func.func @test_invalid_result_materialization() { func.func @test_invalid_result_materialization() 
{ // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}} %result = "test.type_producer"() : () -> f16 - // expected-note@below{{see existing live user here}} "foo.return"(%result) : (f16) -> () } @@ -52,7 +49,6 @@ func.func @test_transitive_use_materialization() { func.func @test_transitive_use_invalid_materialization() { // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}} %result = "test.another_type_producer"() : () -> f16 - // expected-note@below{{see existing live user here}} "foo.return"(%result) : (f16) -> () } @@ -103,9 +99,9 @@ func.func @test_block_argument_not_converted() { func.func @test_signature_conversion_no_converter() { "test.signature_conversion_no_converter"() ({ // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f32' that remained live after conversion}} + // expected-note@below {{see existing live user here}} ^bb0(%arg0: f32): "test.type_consumer"(%arg0) : (f32) -> () - // expected-note@below{{see existing live user here}} "test.return"(%arg0) : (f32) -> () }) : () -> () return From c315d787e3680e7f48d9de0502bb83300b190f84 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 30 Aug 2024 12:25:50 -0700 Subject: [PATCH 88/98] [VP] Reduce duplicate code in vp.reduce expansions Primary goal is having one way of doing this, to ensure that we don't end up with accidental divergence. --- llvm/lib/CodeGen/ExpandVectorPredication.cpp | 65 +++++--------------- 1 file changed, 15 insertions(+), 50 deletions(-) diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index 675d88d6d38cd9..5140f5951d6d3f 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -26,6 +26,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include using namespace llvm; @@ -437,69 +438,33 @@ CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder, default: llvm_unreachable("Impossible reduction kind"); case Intrinsic::vp_reduce_add: - Reduction = Builder.CreateAddReduce(RedOp); - Reduction = Builder.CreateAdd(Reduction, Start); - break; case Intrinsic::vp_reduce_mul: - Reduction = Builder.CreateMulReduce(RedOp); - Reduction = Builder.CreateMul(Reduction, Start); - break; case Intrinsic::vp_reduce_and: - Reduction = Builder.CreateAndReduce(RedOp); - Reduction = Builder.CreateAnd(Reduction, Start); - break; case Intrinsic::vp_reduce_or: - Reduction = Builder.CreateOrReduce(RedOp); - Reduction = Builder.CreateOr(Reduction, Start); - break; - case Intrinsic::vp_reduce_xor: - Reduction = Builder.CreateXorReduce(RedOp); - Reduction = Builder.CreateXor(Reduction, Start); - break; - case Intrinsic::vp_reduce_smax: - Reduction = Builder.CreateIntMaxReduce(RedOp, /*IsSigned*/ true); + case Intrinsic::vp_reduce_xor: { + Intrinsic::ID RedID = *VPI.getFunctionalIntrinsicID(); + unsigned Opc = getArithmeticReductionInstruction(RedID); + assert(Instruction::isBinaryOp(Opc)); + Reduction = Builder.CreateUnaryIntrinsic(RedID, RedOp); Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::smax, Reduction, Start); + Builder.CreateBinOp((Instruction::BinaryOps)Opc, Reduction, Start); break; + } + case Intrinsic::vp_reduce_smax: case Intrinsic::vp_reduce_smin: - Reduction = Builder.CreateIntMinReduce(RedOp, 
/*IsSigned*/ true); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::smin, Reduction, Start); - break; case Intrinsic::vp_reduce_umax: - Reduction = Builder.CreateIntMaxReduce(RedOp, /*IsSigned*/ false); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::umax, Reduction, Start); - break; case Intrinsic::vp_reduce_umin: - Reduction = Builder.CreateIntMinReduce(RedOp, /*IsSigned*/ false); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::umin, Reduction, Start); - break; case Intrinsic::vp_reduce_fmax: - Reduction = Builder.CreateFPMaxReduce(RedOp); - transferDecorations(*Reduction, VPI); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, Reduction, Start); - break; case Intrinsic::vp_reduce_fmin: - Reduction = Builder.CreateFPMinReduce(RedOp); - transferDecorations(*Reduction, VPI); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::minnum, Reduction, Start); - break; case Intrinsic::vp_reduce_fmaximum: - Reduction = Builder.CreateFPMaximumReduce(RedOp); - transferDecorations(*Reduction, VPI); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::maximum, Reduction, Start); - break; - case Intrinsic::vp_reduce_fminimum: - Reduction = Builder.CreateFPMinimumReduce(RedOp); + case Intrinsic::vp_reduce_fminimum: { + Intrinsic::ID RedID = *VPI.getFunctionalIntrinsicID(); + Intrinsic::ID ScalarID = getMinMaxReductionIntrinsicOp(RedID); + Reduction = Builder.CreateUnaryIntrinsic(RedID, RedOp); transferDecorations(*Reduction, VPI); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::minimum, Reduction, Start); + Reduction = Builder.CreateBinaryIntrinsic(ScalarID, Reduction, Start); break; + } case Intrinsic::vp_reduce_fadd: Reduction = Builder.CreateFAddReduce(Start, RedOp); break; From a3f8790901cafaec8bcd863bd30b4f9ab7917bd8 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Fri, 30 Aug 2024 15:38:02 -0400 Subject: [PATCH 89/98] [libc++][NFC] Minor reformatting in --- libcxx/include/cstddef | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/libcxx/include/cstddef b/libcxx/include/cstddef index 1a4049e4d34f2d..592f6261a6de3f 100644 --- a/libcxx/include/cstddef +++ b/libcxx/include/cstddef @@ -66,8 +66,8 @@ using ::max_align_t _LIBCPP_USING_IF_EXISTS; _LIBCPP_END_NAMESPACE_STD #if _LIBCPP_STD_VER >= 17 -namespace std // purposefully not versioned -{ +namespace std { // purposefully not versioned + enum class byte : unsigned char {}; _LIBCPP_HIDE_FROM_ABI inline constexpr byte operator|(byte __lhs, byte __rhs) noexcept { @@ -127,7 +127,6 @@ template ::value, int> = 0> } } // namespace std - -#endif +#endif // _LIBCPP_STD_VER >= 17 #endif // _LIBCPP_CSTDDEF From c53008de899653818b22c44eafd7e5eaab524e2b Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 30 Aug 2024 12:44:02 -0700 Subject: [PATCH 90/98] [VPlan] Manually jumpthread a bit of reduction code for readability [nfc] --- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index f84317ba51257a..c9cee652d2d326 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1802,18 +1802,18 @@ void VPReductionRecipe::execute(VPTransformState &State) { (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain, NewVecOp); PrevInChain = NewRed; + NextInChain = NewRed; } else { PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true); NewRed = 
createTargetReduction(State.Builder, RdxDesc, NewVecOp); + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) + NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), + NewRed, PrevInChain); + else + NextInChain = State.Builder.CreateBinOp( + (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, + PrevInChain); } - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { - NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), - NewRed, PrevInChain); - } else if (IsOrdered) - NextInChain = NewRed; - else - NextInChain = State.Builder.CreateBinOp( - (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain); State.set(this, NextInChain, Part, /*IsScalar*/ true); } } From 923a1c1fc348f7c30ff4726b54ed63ce403dc3ce Mon Sep 17 00:00:00 2001 From: Brendan Dahl Date: Fri, 30 Aug 2024 13:01:16 -0700 Subject: [PATCH 91/98] [WebAssembly] Update FP16 opcodes to match current spec. (#106759) https://github.com/WebAssembly/half-precision/blob/f267a3d54432e5723dcc13ad4530c3581a0cc4b3/proposals/half-precision/Overview.md#binary-format --- .../WebAssembly/WebAssemblyInstrSIMD.td | 24 ++++----- llvm/test/MC/WebAssembly/simd-encodings.s | 54 +++++++++---------- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index da4b8d228f627d..9d17d90f530541 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -763,7 +763,7 @@ multiclass SIMDConditionInt baseInst> { multiclass SIMDConditionFP baseInst> { defm "" : SIMDCondition; defm "" : SIMDCondition; - defm "" : HalfPrecisionCondition; + defm "" : HalfPrecisionCondition; } // Equality: eq @@ -1218,7 +1218,7 @@ multiclass SIMDUnaryFP baseInst> { // Unlike F32x4 and F64x2 there's not a gap in the opcodes between "neg" and // "sqrt" so subtract one from the offset. defm "" : HalfPrecisionUnary; + !add(baseInst,!if(!eq(name, "sqrt"), 79, 80))>; } // Absolute value: abs @@ -1239,10 +1239,10 @@ defm CEIL : SIMDUnary; defm FLOOR : SIMDUnary; defm TRUNC: SIMDUnary; defm NEAREST: SIMDUnary; -defm CEIL : HalfPrecisionUnary; -defm FLOOR : HalfPrecisionUnary; -defm TRUNC : HalfPrecisionUnary; -defm NEAREST : HalfPrecisionUnary; +defm CEIL : HalfPrecisionUnary; +defm FLOOR : HalfPrecisionUnary; +defm TRUNC : HalfPrecisionUnary; +defm NEAREST : HalfPrecisionUnary; // WebAssembly doesn't expose inexact exceptions, so map frint to fnearbyint. def : Pat<(v4f32 (frint (v4f32 V128:$src))), (NEAREST_F32x4 V128:$src)>; @@ -1261,7 +1261,7 @@ def : Pat<(v8f16 (froundeven (v8f16 V128:$src))), (NEAREST_F16x8 V128:$src)>; multiclass SIMDBinaryFP baseInst> { defm "" : SIMDBinary; defm "" : SIMDBinary; - defm "" : HalfPrecisionBinary; + defm "" : HalfPrecisionBinary; } // Addition: add @@ -1362,8 +1362,8 @@ multiclass HalfPrecisionConvert; defm "" : SIMDConvert; -defm "" : HalfPrecisionConvert; -defm "" : HalfPrecisionConvert; +defm "" : HalfPrecisionConvert; +defm "" : HalfPrecisionConvert; // Support the saturating variety as well. def trunc_s_sat32 : PatFrag<(ops node:$x), (fp_to_sint_sat $x, i32)>; @@ -1394,8 +1394,8 @@ defm "" : SIMDConvert; defm "" : SIMDConvert; defm "" : SIMDConvert; defm "" : SIMDConvert; -defm "" : HalfPrecisionConvert; -defm "" : HalfPrecisionConvert; +defm "" : HalfPrecisionConvert; +defm "" : HalfPrecisionConvert; // Extending operations // TODO: refactor this to be uniform for i64x2 if the numbering is not changed. 
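The renumbered opcodes above correspond to the two-byte LEB128 payload that follows the 0xfd SIMD prefix in the simd-encodings.s checks further below: [0xfd,0xbd,0x02] decodes to 0x13d (f16x8.add), and [0xfd,0xce,0x02] to 0x14e (f16x8.relaxed_madd). A minimal standalone C++ sketch of that decoding, assuming nothing beyond the standard unsigned-LEB128 scheme (this is an illustration, not LLVM's MC decoder):

#include <cstdint>
#include <cstdio>

// Decode the unsigned LEB128 opcode payload that follows the 0xfd prefix.
static uint32_t decodeULEB128(const uint8_t *p) {
  uint32_t result = 0;
  unsigned shift = 0;
  uint8_t byte;
  do {
    byte = *p++;
    result |= uint32_t(byte & 0x7f) << shift;
    shift += 7;
  } while (byte & 0x80);
  return result;
}

int main() {
  const uint8_t addEnc[] = {0xbd, 0x02};  // f16x8.add
  const uint8_t maddEnc[] = {0xce, 0x02}; // f16x8.relaxed_madd
  std::printf("f16x8.add          -> 0x%x\n", decodeULEB128(addEnc));  // 0x13d
  std::printf("f16x8.relaxed_madd -> 0x%x\n", decodeULEB128(maddEnc)); // 0x14e
}

Decoding the payloads this way makes it easy to verify by hand that, after this change, the unary group (abs at 0x130 through nearest at 0x136) and the comparison group (eq at 0x137 through ge at 0x13c) now precede the arithmetic group starting at 0x13d, matching the spec revision linked in the commit message.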
@@ -1538,7 +1538,7 @@ multiclass SIMDMADD simdopA, bits<32> simdopS, list defm "" : SIMDMADD; defm "" : SIMDMADD; -defm "" : SIMDMADD; +defm "" : SIMDMADD; //===----------------------------------------------------------------------===// // Laneselect diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s index 45335b348b7e8f..48aec4bc52a0c5 100644 --- a/llvm/test/MC/WebAssembly/simd-encodings.s +++ b/llvm/test/MC/WebAssembly/simd-encodings.s @@ -854,85 +854,85 @@ main: # CHECK: f16x8.replace_lane 1 # encoding: [0xfd,0xa2,0x02,0x01] f16x8.replace_lane 1 - # CHECK: f16x8.add # encoding: [0xfd,0xb4,0x02] + # CHECK: f16x8.add # encoding: [0xfd,0xbd,0x02] f16x8.add - # CHECK: f16x8.sub # encoding: [0xfd,0xb5,0x02] + # CHECK: f16x8.sub # encoding: [0xfd,0xbe,0x02] f16x8.sub - # CHECK: f16x8.mul # encoding: [0xfd,0xb6,0x02] + # CHECK: f16x8.mul # encoding: [0xfd,0xbf,0x02] f16x8.mul - # CHECK: f16x8.div # encoding: [0xfd,0xb7,0x02] + # CHECK: f16x8.div # encoding: [0xfd,0xc0,0x02] f16x8.div - # CHECK: f16x8.min # encoding: [0xfd,0xb8,0x02] + # CHECK: f16x8.min # encoding: [0xfd,0xc1,0x02] f16x8.min - # CHECK: f16x8.max # encoding: [0xfd,0xb9,0x02] + # CHECK: f16x8.max # encoding: [0xfd,0xc2,0x02] f16x8.max - # CHECK: f16x8.pmin # encoding: [0xfd,0xba,0x02] + # CHECK: f16x8.pmin # encoding: [0xfd,0xc3,0x02] f16x8.pmin - # CHECK: f16x8.pmax # encoding: [0xfd,0xbb,0x02] + # CHECK: f16x8.pmax # encoding: [0xfd,0xc4,0x02] f16x8.pmax - # CHECK: f16x8.eq # encoding: [0xfd,0xc0,0x02] + # CHECK: f16x8.eq # encoding: [0xfd,0xb7,0x02] f16x8.eq - # CHECK: f16x8.ne # encoding: [0xfd,0xc1,0x02] + # CHECK: f16x8.ne # encoding: [0xfd,0xb8,0x02] f16x8.ne - # CHECK: f16x8.lt # encoding: [0xfd,0xc2,0x02] + # CHECK: f16x8.lt # encoding: [0xfd,0xb9,0x02] f16x8.lt - # CHECK: f16x8.gt # encoding: [0xfd,0xc3,0x02] + # CHECK: f16x8.gt # encoding: [0xfd,0xba,0x02] f16x8.gt - # CHECK: f16x8.le # encoding: [0xfd,0xc4,0x02] + # CHECK: f16x8.le # encoding: [0xfd,0xbb,0x02] f16x8.le - # CHECK: f16x8.ge # encoding: [0xfd,0xc5,0x02] + # CHECK: f16x8.ge # encoding: [0xfd,0xbc,0x02] f16x8.ge - # CHECK: f16x8.abs # encoding: [0xfd,0xb1,0x02] + # CHECK: f16x8.abs # encoding: [0xfd,0xb0,0x02] f16x8.abs - # CHECK: f16x8.neg # encoding: [0xfd,0xb2,0x02] + # CHECK: f16x8.neg # encoding: [0xfd,0xb1,0x02] f16x8.neg - # CHECK: f16x8.sqrt # encoding: [0xfd,0xb3,0x02] + # CHECK: f16x8.sqrt # encoding: [0xfd,0xb2,0x02] f16x8.sqrt - # CHECK: f16x8.ceil # encoding: [0xfd,0xbc,0x02] + # CHECK: f16x8.ceil # encoding: [0xfd,0xb3,0x02] f16x8.ceil - # CHECK: f16x8.floor # encoding: [0xfd,0xbd,0x02] + # CHECK: f16x8.floor # encoding: [0xfd,0xb4,0x02] f16x8.floor - # CHECK: f16x8.trunc # encoding: [0xfd,0xbe,0x02] + # CHECK: f16x8.trunc # encoding: [0xfd,0xb5,0x02] f16x8.trunc - # CHECK: f16x8.nearest # encoding: [0xfd,0xbf,0x02] + # CHECK: f16x8.nearest # encoding: [0xfd,0xb6,0x02] f16x8.nearest - # CHECK: f16x8.relaxed_madd # encoding: [0xfd,0xc6,0x02] + # CHECK: f16x8.relaxed_madd # encoding: [0xfd,0xce,0x02] f16x8.relaxed_madd - # CHECK: f16x8.relaxed_nmadd # encoding: [0xfd,0xc7,0x02] + # CHECK: f16x8.relaxed_nmadd # encoding: [0xfd,0xcf,0x02] f16x8.relaxed_nmadd - # CHECK: i16x8.trunc_sat_f16x8_s # encoding: [0xfd,0xc8,0x02] + # CHECK: i16x8.trunc_sat_f16x8_s # encoding: [0xfd,0xc5,0x02] i16x8.trunc_sat_f16x8_s - # CHECK: i16x8.trunc_sat_f16x8_u # encoding: [0xfd,0xc9,0x02] + # CHECK: i16x8.trunc_sat_f16x8_u # encoding: [0xfd,0xc6,0x02] i16x8.trunc_sat_f16x8_u - # CHECK: f16x8.convert_i16x8_s # 
encoding: [0xfd,0xca,0x02] + # CHECK: f16x8.convert_i16x8_s # encoding: [0xfd,0xc7,0x02] f16x8.convert_i16x8_s - # CHECK: f16x8.convert_i16x8_u # encoding: [0xfd,0xcb,0x02] + # CHECK: f16x8.convert_i16x8_u # encoding: [0xfd,0xc8,0x02] f16x8.convert_i16x8_u end_function From 5e7f0dcd69fd666bbb2a93d20e6a56a11261b519 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 30 Aug 2024 13:16:26 -0700 Subject: [PATCH 92/98] [lldb] Include checksum in source cache dump (#106773) This patch updates the source cache dump command to print both the actual (on-disk) checksum and the expected (line table) checksum. To achieve that we now read and store the on-disk checksum in the cached object. The same information will be used in a future path to print a warning when the checksums differ. --- lldb/include/lldb/Core/SourceManager.h | 6 ++++++ lldb/source/Core/SourceManager.cpp | 27 +++++++++++++++++++------- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/lldb/include/lldb/Core/SourceManager.h b/lldb/include/lldb/Core/SourceManager.h index ae7bd3d2311f96..172824dc78a6bc 100644 --- a/lldb/include/lldb/Core/SourceManager.h +++ b/lldb/include/lldb/Core/SourceManager.h @@ -9,6 +9,7 @@ #ifndef LLDB_CORE_SOURCEMANAGER_H #define LLDB_CORE_SOURCEMANAGER_H +#include "lldb/Utility/Checksum.h" #include "lldb/Utility/FileSpec.h" #include "lldb/lldb-defines.h" #include "lldb/lldb-forward.h" @@ -71,6 +72,8 @@ class SourceManager { llvm::sys::TimePoint<> GetTimestamp() const { return m_mod_time; } + const Checksum &GetChecksum() const { return m_checksum; } + protected: /// Set file and update modification time. void SetSupportFile(lldb::SupportFileSP support_file_sp); @@ -81,6 +84,9 @@ class SourceManager { /// different from the original support file passed to the constructor. lldb::SupportFileSP m_support_file_sp; + /// Keep track of the on-disk checksum. + Checksum m_checksum; + // Keep the modification time that this file data is valid for llvm::sys::TimePoint<> m_mod_time; diff --git a/lldb/source/Core/SourceManager.cpp b/lldb/source/Core/SourceManager.cpp index c427bb91f4643a..f6e59ce731a573 100644 --- a/lldb/source/Core/SourceManager.cpp +++ b/lldb/source/Core/SourceManager.cpp @@ -447,13 +447,14 @@ void SourceManager::FindLinesMatchingRegex(SupportFileSP support_file_sp, SourceManager::File::File(SupportFileSP support_file_sp, lldb::DebuggerSP debugger_sp) - : m_support_file_sp(std::make_shared()), m_mod_time(), - m_debugger_wp(debugger_sp), m_target_wp(TargetSP()) { + : m_support_file_sp(std::make_shared()), m_checksum(), + m_mod_time(), m_debugger_wp(debugger_sp), m_target_wp(TargetSP()) { CommonInitializer(support_file_sp, {}); } SourceManager::File::File(SupportFileSP support_file_sp, TargetSP target_sp) - : m_support_file_sp(std::make_shared()), m_mod_time(), + : m_support_file_sp(std::make_shared()), m_checksum(), + m_mod_time(), m_debugger_wp(target_sp ? target_sp->GetDebugger().shared_from_this() : DebuggerSP()), m_target_wp(target_sp) { @@ -532,9 +533,11 @@ void SourceManager::File::CommonInitializer(SupportFileSP support_file_sp, } // If the file exists, read in the data. 
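The hunk just below is where the on-disk MD5 gets computed and cached. For reference, a minimal sketch of the comparison this enables, using only the llvm::MD5 API the patch itself relies on; checksumsMatch is a hypothetical helper name, not something the patch adds:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/MD5.h"

// Hypothetical helper: hash the file's bytes and compare the result against
// the checksum recorded in the debug-info line table.
static bool checksumsMatch(llvm::ArrayRef<uint8_t> onDiskBytes,
                           const llvm::MD5::MD5Result &lineTableChecksum) {
  llvm::MD5::MD5Result onDisk = llvm::MD5::hash(onDiskBytes);
  return onDisk == lineTableChecksum; // digests compare byte-for-byte
}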
- if (m_mod_time != llvm::sys::TimePoint<>()) + if (m_mod_time != llvm::sys::TimePoint<>()) { m_data_sp = FileSystem::Instance().CreateDataBuffer( m_support_file_sp->GetSpecOnly()); + m_checksum = llvm::MD5::hash(m_data_sp->GetData()); + } } void SourceManager::File::SetSupportFile(lldb::SupportFileSP support_file_sp) { @@ -835,14 +838,24 @@ SourceManager::FileSP SourceManager::SourceFileCache::FindSourceFile( return {}; } +static std::string toString(const Checksum &checksum) { + if (!checksum) + return ""; + return std::string(llvm::formatv("{0}", checksum.digest())); +} + void SourceManager::SourceFileCache::Dump(Stream &stream) const { - stream << "Modification time Lines Path\n"; - stream << "------------------- -------- --------------------------------\n"; + // clang-format off + stream << "Modification time MD5 Checksum (on-disk) MD5 Checksum (line table) Lines Path\n"; + stream << "------------------- -------------------------------- -------------------------------- -------- --------------------------------\n"; + // clang-format on for (auto &entry : m_file_cache) { if (!entry.second) continue; FileSP file = entry.second; - stream.Format("{0:%Y-%m-%d %H:%M:%S} {1,8:d} {2}\n", file->GetTimestamp(), + stream.Format("{0:%Y-%m-%d %H:%M:%S} {1,32} {2,32} {3,8:d} {4}\n", + file->GetTimestamp(), toString(file->GetChecksum()), + toString(file->GetSupportFile()->GetChecksum()), file->GetNumLines(), entry.first.GetPath()); } } From 432e9f44101e44bb996c350cf5693038916953f3 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 30 Aug 2024 13:19:31 -0700 Subject: [PATCH 93/98] [llvm][LoongArch] Avoid shift overflow (#106785) Follow up fix to #106332 `LoongArchMatInt.cpp:96:33: runtime error: shift exponent 64 is too large for 64-bit type` https://lab.llvm.org/buildbot/#/builders/169/builds/2681 --- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp index 6ad2c003558a51..a7823470382756 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp @@ -92,8 +92,9 @@ LoongArchMatInt::InstSeq LoongArchMatInt::generateInstSeq(int64_t Val) { break; } - for (uint64_t Msb = 32; Msb < 64; ++Msb) { - uint64_t HighMask = ~((1ULL << (Msb + 1)) - 1); + uint64_t Msb = 32; + uint64_t HighMask = ~((1ULL << (Msb + 1)) - 1); + for (; Msb < 64; ++Msb, HighMask = (HighMask << 1) + 1) { for (uint64_t Lsb = Msb; Lsb > 0; --Lsb) { uint64_t LowMask = (1ULL << Lsb) - 1; uint64_t Mask = HighMask | LowMask; From 982d2445f2a5bad96c501ff23923648ffa094ef2 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 30 Aug 2024 13:51:53 -0700 Subject: [PATCH 94/98] Revert "AtomicExpand: Allow incrementally legalizing atomicrmw" (#106792) Reverts llvm/llvm-project#103371 There is `heap-use-after-free`, commented on 206b5aff44a95754f6dd7a5696efa024e983ac59 Maybe `if (Next == E || BB != Next->getParent()) {` is enough, but not sure, what was the intent there, --- llvm/lib/CodeGen/AtomicExpandPass.cpp | 35 +- llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll | 373 +++++++++----------- llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll | 373 +++++++++----------- llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll | 373 +++++++++----------- llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll | 373 +++++++++----------- 5 files changed, 691 insertions(+), 836 deletions(-) diff --git 
a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index b9732e816ea7e6..39a705599f90cc 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -351,30 +351,17 @@ bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) { bool MadeChange = false; - for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE;) { - BasicBlock *BB = &*BBI; - ++BBI; - - BasicBlock::iterator Next; - - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; - I = Next) { - Instruction &Inst = *I; - Next = std::next(I); - - if (processAtomicInstr(&Inst)) { - MadeChange = true; - - // Detect control flow change and resume iteration from the original - // block to inspect any newly inserted blocks. This allows incremental - // legalizaton of atomicrmw and cmpxchg. - if (BB != Next->getParent()) { - BBI = BB->getIterator(); - BBE = F.end(); - break; - } - } - } + SmallVector AtomicInsts; + + // Changing control-flow while iterating through it is a bad idea, so gather a + // list of all atomic instructions before we start. + for (Instruction &I : instructions(F)) + if (I.isAtomic() && !isa(&I)) + AtomicInsts.push_back(&I); + + for (auto *I : AtomicInsts) { + if (processAtomicInstr(I)) + MadeChange = true; } return MadeChange; diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll index ed9c1b037d0cc7..0d230bb9dcc6e9 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -43,49 +43,46 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -131,49 +128,46 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -238,40 +232,36 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 2 @@ -337,40 +327,36 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 4 @@ -413,38 +399,35 @@ define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldr w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w1, w20 -; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, float %value seq_cst, align 4 @@ -486,40 +469,36 @@ define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x20, x1 +; SOFTFP-NOLSE-NEXT: mov x21, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x21 -; SOFTFP-NOLSE-NEXT: mov x1, x20 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl __adddf3 -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x21 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, double %value seq_cst, align 8 @@ -708,18 +687,18 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -732,33 +711,29 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr 
%ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w22 +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] -; SOFTFP-NOLSE-NEXT: cmp w22, w8 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -824,18 +799,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 @@ -845,28 +819,25 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil 
w23, w22, #0, #16 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] -; SOFTFP-NOLSE-NEXT: cmp w22, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -914,49 +885,45 @@ define <2 x float> @test_atomicrmw_fadd_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB9_6 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 ; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: mov w9, w22 -; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] -; SOFTFP-NOLSE-NEXT: cmp x22, x9 +; 
SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB9_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 -; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll index 888b795876f7df..bfe0d20ca814bc 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -45,49 +45,46 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // 
%bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -133,49 +130,46 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_6: // 
%atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -240,40 +234,36 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 2 @@ -339,40 +329,36 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 4 @@ -415,38 +401,35 @@ define float @test_atomicrmw_fmax_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldr w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w1, w20 -; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, float %value seq_cst, align 4 @@ -488,40 +471,36 @@ define double @test_atomicrmw_fmax_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x20, x1 +; SOFTFP-NOLSE-NEXT: mov x21, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x21 -; SOFTFP-NOLSE-NEXT: mov x1, x20 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl fmax -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x21 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, double %value seq_cst, align 8 @@ -588,18 +567,18 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB6_6 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_5 ; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 @@ -612,33 +591,29 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, 
<2 x half> ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w22 +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] -; SOFTFP-NOLSE-NEXT: cmp w22, w8 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB6_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -748,18 +723,17 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -769,28 +743,25 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 +; 
SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] -; SOFTFP-NOLSE-NEXT: cmp w22, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -838,49 +809,45 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: mov w9, w22 -; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] -; SOFTFP-NOLSE-NEXT: cmp x22, x9 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] 
+; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 -; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll index a3665c6e428608..6b7d2df044460a 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -45,49 +45,46 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; 
SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -133,49 +130,46 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end +; 
SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -240,40 +234,36 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 2 @@ -339,40 +329,36 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 4 @@ -415,38 +401,35 @@ define float @test_atomicrmw_fmin_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldr w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w1, w20 -; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, float %value seq_cst, align 4 @@ -488,40 +471,36 @@ define double @test_atomicrmw_fmin_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x20, x1 +; SOFTFP-NOLSE-NEXT: mov x21, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x21 -; SOFTFP-NOLSE-NEXT: mov x1, x20 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl fmin -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x21 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, double %value seq_cst, align 8 @@ -588,18 +567,18 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB6_6 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_5 ; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 @@ -612,33 +591,29 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, 
<2 x half> ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w22 +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] -; SOFTFP-NOLSE-NEXT: cmp w22, w8 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB6_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -748,18 +723,17 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -769,28 +743,25 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 +; 
SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] -; SOFTFP-NOLSE-NEXT: cmp w22, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -838,49 +809,45 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: mov w9, w22 -; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] -; SOFTFP-NOLSE-NEXT: cmp x22, x9 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] 
+; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 -; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll index 7725ce0e731859..67e164037d5ce7 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll @@ -43,49 +43,46 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; 
SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -131,49 +128,46 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end +; 
SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -238,40 +232,36 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 2 @@ -337,40 +327,36 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 4 @@ -413,38 +399,35 @@ define float @test_atomicrmw_fsub_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldr w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w1, w20 -; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, float %value seq_cst, align 4 @@ -486,40 +469,36 @@ define double @test_atomicrmw_fsub_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x20, x1 +; SOFTFP-NOLSE-NEXT: mov x21, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x21 -; SOFTFP-NOLSE-NEXT: mov x1, x20 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl __subdf3 -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x21 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, double %value seq_cst, align 8 @@ -708,18 +687,18 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -732,33 +711,29 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr 
%ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w22 +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] -; SOFTFP-NOLSE-NEXT: cmp w22, w8 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -824,18 +799,17 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 @@ -845,28 +819,25 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil 
w23, w22, #0, #16 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] -; SOFTFP-NOLSE-NEXT: cmp w22, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -914,49 +885,45 @@ define <2 x float> @test_atomicrmw_fsub_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB9_6 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 ; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: mov w9, w22 -; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] -; SOFTFP-NOLSE-NEXT: cmp x22, x9 +; 
SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB9_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 -; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload From 57fe53cae40351ebd079a9a0105addf4ad2e97dd Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Fri, 30 Aug 2024 16:54:09 -0400 Subject: [PATCH 95/98] [libc++] First attempt to regroup a few modules in the modulemap (#98214) We split up all the headers into top-level modules when we broke up cycles with the C compatibility headers. However, this resulted in a large number of small modules, which is awkward and clearly against the philosophy of Clang modules. This was necessary to make things work. This patch regroups a few headers from two leaf modules: stop_token and pstl. It should be pretty uncontroversial that grouping these headers into a single module doesn't introduce any cyclic dependency, yet it's a first step towards reducing the number of top-level modules we have in our modulemap. 
--- libcxx/include/module.modulemap | 66 ++++++------------- .../atomic_unique_lock.pass.cpp | 7 +- .../intrusive_list_view.pass.cpp | 1 + .../intrusive_shared_ptr.pass.cpp | 1 + 4 files changed, 25 insertions(+), 50 deletions(-) diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 13d0dce34d97e3..f193b5d95f49f5 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -245,8 +245,15 @@ module std_stdexcept [system] { header "stdexcept" export * } -module std_stop_token { +module std_stop_token [system] { header "stop_token" + private header "__stop_token/atomic_unique_lock.h" + private header "__stop_token/intrusive_list_view.h" + private header "__stop_token/intrusive_shared_ptr.h" + private header "__stop_token/stop_callback.h" + private header "__stop_token/stop_source.h" + private header "__stop_token/stop_state.h" + private header "__stop_token/stop_token.h" export * } module std_streambuf [system] { @@ -1592,41 +1599,25 @@ module std_private_numeric_transform_exclusive_scan [system] { header "__numeric module std_private_numeric_transform_inclusive_scan [system] { header "__numeric/transform_inclusive_scan.h" } module std_private_numeric_transform_reduce [system] { header "__numeric/transform_reduce.h" } -module std_private_pstl_backend [system] { +module std_private_pstl [system] { header "__pstl/backend.h" - export * -} -module std_private_pstl_backend_fwd [system] { header "__pstl/backend_fwd.h" - export * -} -module std_private_pstl_backends_default [system] { header "__pstl/backends/default.h" - export * -} -module std_private_pstl_backends_libdispatch [system] { header "__pstl/backends/libdispatch.h" - export * -} -module std_private_pstl_backends_serial [system] { header "__pstl/backends/serial.h" - export * -} -module std_private_pstl_backends_std_thread [system] { header "__pstl/backends/std_thread.h" - export * + header "__pstl/cpu_algos/any_of.h" + header "__pstl/cpu_algos/cpu_traits.h" + header "__pstl/cpu_algos/fill.h" + header "__pstl/cpu_algos/find_if.h" + header "__pstl/cpu_algos/for_each.h" + header "__pstl/cpu_algos/merge.h" + header "__pstl/cpu_algos/stable_sort.h" + header "__pstl/cpu_algos/transform.h" + header "__pstl/cpu_algos/transform_reduce.h" + header "__pstl/dispatch.h" + header "__pstl/handle_exception.h" } -module std_private_pstl_cpu_algos_any_of [system] { header "__pstl/cpu_algos/any_of.h" } -module std_private_pstl_cpu_algos_cpu_traits [system] { header "__pstl/cpu_algos/cpu_traits.h" } -module std_private_pstl_cpu_algos_fill [system] { header "__pstl/cpu_algos/fill.h" } -module std_private_pstl_cpu_algos_find_if [system] { header "__pstl/cpu_algos/find_if.h" } -module std_private_pstl_cpu_algos_for_each [system] { header "__pstl/cpu_algos/for_each.h" } -module std_private_pstl_cpu_algos_merge [system] { header "__pstl/cpu_algos/merge.h" } -module std_private_pstl_cpu_algos_stable_sort [system] { header "__pstl/cpu_algos/stable_sort.h" } -module std_private_pstl_cpu_algos_transform [system] { header "__pstl/cpu_algos/transform.h" } -module std_private_pstl_cpu_algos_transform_reduce [system] { header "__pstl/cpu_algos/transform_reduce.h" } -module std_private_pstl_dispatch [system] { header "__pstl/dispatch.h" } -module std_private_pstl_handle_exception [system] { header "__pstl/handle_exception.h" } module std_private_queue_fwd [system] { header "__fwd/queue.h" } @@ -1781,23 +1772,6 @@ module std_private_span_span_fwd [system] { header "__fwd/span.h" } module std_private_stack_fwd [system] 
{ header "__fwd/stack.h" }
-module std_private_stop_token_atomic_unique_lock [system] { header "__stop_token/atomic_unique_lock.h" }
-module std_private_stop_token_intrusive_list_view [system] { header "__stop_token/intrusive_list_view.h" }
-module std_private_stop_token_intrusive_shared_ptr [system] { header "__stop_token/intrusive_shared_ptr.h" }
-module std_private_stop_token_stop_callback [system] { header "__stop_token/stop_callback.h" }
-module std_private_stop_token_stop_source [system] {
-  header "__stop_token/stop_source.h"
-  export *
-}
-module std_private_stop_token_stop_state [system] {
-  header "__stop_token/stop_state.h"
-  export *
-}
-module std_private_stop_token_stop_token [system] {
-  header "__stop_token/stop_token.h"
-  export *
-}
-
 module std_private_string_char_traits [system] {
   header "__string/char_traits.h"
   export *
diff --git a/libcxx/test/libcxx/thread/thread.stoptoken/atomic_unique_lock.pass.cpp b/libcxx/test/libcxx/thread/thread.stoptoken/atomic_unique_lock.pass.cpp
index 2a9b828f4389ce..44d51921ac74ad 100644
--- a/libcxx/test/libcxx/thread/thread.stoptoken/atomic_unique_lock.pass.cpp
+++ b/libcxx/test/libcxx/thread/thread.stoptoken/atomic_unique_lock.pass.cpp
@@ -5,12 +5,11 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
-
-// XFAIL: availability-synchronization_library-missing
+// UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: availability-synchronization_library-missing
+// ADDITIONAL_COMPILE_FLAGS: -Wno-private-header

 #include <__stop_token/atomic_unique_lock.h>
 #include
diff --git a/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_list_view.pass.cpp b/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_list_view.pass.cpp
index 85cd9786258955..d8cd2fb68e132e 100644
--- a/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_list_view.pass.cpp
+++ b/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_list_view.pass.cpp
@@ -8,6 +8,7 @@
 //
 // UNSUPPORTED: c++03, c++11, c++14, c++17
+// ADDITIONAL_COMPILE_FLAGS: -Wno-private-header

 #include <__stop_token/intrusive_list_view.h>
 #include
diff --git a/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_shared_ptr.pass.cpp b/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_shared_ptr.pass.cpp
index 47440015f2c50c..99d4226662a0b7 100644
--- a/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_shared_ptr.pass.cpp
+++ b/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_shared_ptr.pass.cpp
@@ -8,6 +8,7 @@
 //
 // UNSUPPORTED: c++03, c++11, c++14, c++17
+// ADDITIONAL_COMPILE_FLAGS: -Wno-private-header

 #include <__stop_token/intrusive_shared_ptr.h>
 #include

From 06c531e808ceeafdf996867a2e8e66960ae4774e Mon Sep 17 00:00:00 2001
From: yonghong-song
Date: Fri, 30 Aug 2024 14:00:33 -0700
Subject: [PATCH 96/98] BPF: Generate locked insn for __sync_fetch_and_add() with cpu v1/v2 (#106494)

This patch contains two parts:

- first, to revert the patch https://github.com/llvm/llvm-project/pull/101428.
- second, to remove the `atomic_fetch_and_*()` to `atomic_<op>()` conversion
  (when the return value is not used), but preserve the lowering of
  `__sync_fetch_and_add()` to a locked insn with cpu v1/v2.
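Before the mechanical changes below, here is a minimal sketch of the two source-level situations the patch distinguishes on cpu v1/v2 (ordinary C/C++, not taken from the patch; the function names and the exact compile command are illustrative assumptions):

.. code-block:: c++

  // Hypothetically compiled with: clang --target=bpf -mcpu=v1 -O2

  // Result discarded: can be lowered to the locked insn
  //   lock *(u32 *)(r1 + 0) += r2
  // which is encodable on cpu v1/v2 (see the atomics.ll CHECK lines below).
  void bump(int *counter) {
    __sync_fetch_and_add(counter, 1);
  }

  // Result consumed: cpu v1/v2 has no insn that returns the old value, so
  // the new pre-emit checking pass diagnoses "Invalid usage of the XADD
  // return value" (see the xadd.ll test below).
  int bump_and_get(int *counter) {
    return __sync_fetch_and_add(counter, 1);
  }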
--- llvm/lib/Target/BPF/BPF.h | 2 + llvm/lib/Target/BPF/BPFInstrInfo.td | 76 +++------- llvm/lib/Target/BPF/BPFMIChecking.cpp | 181 +++++++++++++++++++++++ llvm/lib/Target/BPF/BPFTargetMachine.cpp | 1 + llvm/lib/Target/BPF/CMakeLists.txt | 1 + llvm/test/CodeGen/BPF/atomics.ll | 15 +- llvm/test/CodeGen/BPF/atomics_2.ll | 2 +- llvm/test/CodeGen/BPF/objdump_atomics.ll | 4 +- llvm/test/CodeGen/BPF/xadd.ll | 59 ++++++++ llvm/test/CodeGen/BPF/xadd_legal.ll | 2 +- 10 files changed, 280 insertions(+), 63 deletions(-) create mode 100644 llvm/lib/Target/BPF/BPFMIChecking.cpp create mode 100644 llvm/test/CodeGen/BPF/xadd.ll diff --git a/llvm/lib/Target/BPF/BPF.h b/llvm/lib/Target/BPF/BPF.h index f7bc6f958470b9..f07ae4c9baf1c6 100644 --- a/llvm/lib/Target/BPF/BPF.h +++ b/llvm/lib/Target/BPF/BPF.h @@ -28,6 +28,7 @@ FunctionPass *createBPFISelDag(BPFTargetMachine &TM); FunctionPass *createBPFMISimplifyPatchablePass(); FunctionPass *createBPFMIPeepholePass(); FunctionPass *createBPFMIPreEmitPeepholePass(); +FunctionPass *createBPFMIPreEmitCheckingPass(); InstructionSelector *createBPFInstructionSelector(const BPFTargetMachine &, const BPFSubtarget &, @@ -36,6 +37,7 @@ InstructionSelector *createBPFInstructionSelector(const BPFTargetMachine &, void initializeBPFCheckAndAdjustIRPass(PassRegistry&); void initializeBPFDAGToDAGISelLegacyPass(PassRegistry &); void initializeBPFMIPeepholePass(PassRegistry &); +void initializeBPFMIPreEmitCheckingPass(PassRegistry &); void initializeBPFMIPreEmitPeepholePass(PassRegistry &); void initializeBPFMISimplifyPatchablePass(PassRegistry &); diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td index 4baeeb017699d6..6c750af5c2fd92 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.td +++ b/llvm/lib/Target/BPF/BPFInstrInfo.td @@ -786,45 +786,13 @@ let Predicates = [BPFNoALU32] in { def : Pat<(i64 (extloadi32 ADDRri:$src)), (i64 (LDW ADDRri:$src))>; } -// Atomic XADD for BPFNoALU32 -class XADD - : TYPE_LD_ST { - bits<4> dst; - bits<20> addr; - - let Inst{51-48} = addr{19-16}; // base reg - let Inst{55-52} = dst; - let Inst{47-32} = addr{15-0}; // offset - let Inst{7-4} = BPF_ADD.Value; - let BPFClass = BPF_STX; -} - // Atomic add, and, or, xor -class ATOMIC_NOFETCH - : TYPE_LD_ST + : TYPE_LD_ST { - bits<4> dst; - bits<20> addr; - - let Inst{51-48} = addr{19-16}; // base reg - let Inst{55-52} = dst; - let Inst{47-32} = addr{15-0}; // offset - let Inst{7-4} = Opc.Value; - let BPFClass = BPF_STX; -} - -class ATOMIC32_NOFETCH - : TYPE_LD_ST { bits<4> dst; bits<20> addr; @@ -838,16 +806,23 @@ class ATOMIC32_NOFETCH let Constraints = "$dst = $val" in { let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in { - def XADDW32 : ATOMIC32_NOFETCH; - def XANDW32 : ATOMIC32_NOFETCH; - def XORW32 : ATOMIC32_NOFETCH; - def XXORW32 : ATOMIC32_NOFETCH; + def XADDW32 : ATOMIC_NOFETCH; + def XANDW32 : ATOMIC_NOFETCH; + def XORW32 : ATOMIC_NOFETCH; + def XXORW32 : ATOMIC_NOFETCH; } + def XADDW : ATOMIC_NOFETCH; + def XADDD : ATOMIC_NOFETCH; + def XANDD : ATOMIC_NOFETCH; + def XORD : ATOMIC_NOFETCH; + def XXORD : ATOMIC_NOFETCH; +} - def XADDD : ATOMIC_NOFETCH; - def XANDD : ATOMIC_NOFETCH; - def XORD : ATOMIC_NOFETCH; - def XXORD : ATOMIC_NOFETCH; +let Predicates = [BPFNoALU32] in { + def : Pat<(atomic_load_add_i32 ADDRri:$addr, GPR:$val), + (XADDW ADDRri:$addr, GPR:$val)>; + def : Pat<(atomic_load_add_i64 ADDRri:$addr, GPR:$val), + (XADDD ADDRri:$addr, GPR:$val)>; } // Atomic Fetch-and- operations @@ -887,13 +862,6 @@ class XFALU32; - def XFADDW : XFALU64; 
-  }
-}
-
 let Constraints = "$dst = $val" in {
   let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in {
     def XFADDW32 : XFALU32;
@@ -902,7 +870,9 @@ let Constraints = "$dst = $val" in {
     def XFXORW32 : XFALU32;
   }

-  def XFADDD : XFALU64;
+  let Predicates = [BPFHasALU32] in {
+    def XFADDD : XFALU64;
+  }
   def XFANDD : XFALU64;
   def XFORD : XFALU64;
   def XFXORD : XFALU64;
diff --git a/llvm/lib/Target/BPF/BPFMIChecking.cpp b/llvm/lib/Target/BPF/BPFMIChecking.cpp
new file mode 100644
index 00000000000000..24224f6c1e9e66
--- /dev/null
+++ b/llvm/lib/Target/BPF/BPFMIChecking.cpp
@@ -0,0 +1,181 @@
+//===-------------- BPFMIChecking.cpp - MI Checking Legality -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs checking to signal errors for certain illegal usages at
+// the MachineInstruction layer. Specifically, the result of the XADD{32,64}
+// insn should not be used. The pass runs at the PreEmit stage, right before
+// the machine code is emitted, at which point the register liveness
+// information is still available.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFInstrInfo.h"
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-mi-checking"
+
+namespace {
+
+struct BPFMIPreEmitChecking : public MachineFunctionPass {
+
+  static char ID;
+  MachineFunction *MF;
+  const TargetRegisterInfo *TRI;
+
+  BPFMIPreEmitChecking() : MachineFunctionPass(ID) {
+    initializeBPFMIPreEmitCheckingPass(*PassRegistry::getPassRegistry());
+  }
+
+private:
+  // Initialize class variables.
+  void initialize(MachineFunction &MFParm);
+
+  void processAtomicInsts();
+
+public:
+  // Main entry point for this pass.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (!skipFunction(MF.getFunction())) {
+      initialize(MF);
+      processAtomicInsts();
+    }
+    return false;
+  }
+};
+
+// Initialize class variables.
+void BPFMIPreEmitChecking::initialize(MachineFunction &MFParm) {
+  MF = &MFParm;
+  TRI = MF->getSubtarget().getRegisterInfo();
+  LLVM_DEBUG(dbgs() << "*** BPF PreEmit checking pass ***\n\n");
+}
+
+// Make sure all Defs of XADD are dead, meaning any result of the XADD insn is
+// not used.
+//
+// NOTE: The BPF backend hasn't enabled sub-register liveness tracking, so when
+// the source and destination operands of XADD are GPR32, there is no
+// sub-register dead info. If we rely on the generic
+// MachineInstr::allDefsAreDead, then we will raise a false alarm on a GPR32
+// Def.
+//
+// To support GPR32 Defs, ideally we could just enable sub-register liveness
+// tracking on the BPF backend, then allDefsAreDead could work on GPR32 Defs.
+// This requires implementing TargetSubtargetInfo::enableSubRegLiveness on BPF.
+//
+// However, the sub-register liveness tracking module inside LLVM is actually
+// designed for the situation where one register could be split into more than
+// one sub-register, in which case each sub-register can have its own liveness
+// and killing one of them doesn't kill the others. So tracking liveness for
+// each makes sense.
+//
+// For BPF, each 64-bit register could only have one 32-bit sub-register. This
+// is exactly the case which LLVM thinks brings no benefit for doing
+// sub-register tracking, because the live range of a sub-register must always
+// equal that of its parent register, therefore liveness tracking is disabled
+// even if the back-end has implemented enableSubRegLiveness. The detailed
+// information is at r232695:
+//
+// Author: Matthias Braun
+// Date: Thu Mar 19 00:21:58 2015 +0000
+// Do not track subregister liveness when it brings no benefits
+//
+// Hence, for BPF, we enhance MachineInstr::allDefsAreDead. Given that the solo
+// sub-register always has the same liveness as its parent register, LLVM is
+// already attaching an implicit 64-bit register Def whenever there is
+// a sub-register Def. The liveness of the implicit 64-bit Def is available.
+// For example, for "lock *(u32 *)(r0 + 4) += w9", the MachineOperand info
+// could be:
+//
+// $w9 = XADDW32 killed $r0, 4, $w9(tied-def 0),
+// implicit killed $r9, implicit-def dead $r9
+//
+// Even though w9 is not marked as Dead, the parent register r9 is marked as
+// Dead correctly, and it is safe to use such information for our purpose.
+static bool hasLiveDefs(const MachineInstr &MI, const TargetRegisterInfo *TRI) {
+  const MCRegisterClass *GPR64RegClass =
+      &BPFMCRegisterClasses[BPF::GPRRegClassID];
+  std::vector<unsigned> GPR32LiveDefs;
+  std::vector<unsigned> GPR64DeadDefs;
+
+  for (const MachineOperand &MO : MI.operands()) {
+    bool RegIsGPR64;
+
+    if (!MO.isReg() || MO.isUse())
+      continue;
+
+    RegIsGPR64 = GPR64RegClass->contains(MO.getReg());
+    if (!MO.isDead()) {
+      // It is a GPR64 live Def, we are sure it is live.
+      if (RegIsGPR64)
+        return true;
+      // It is a GPR32 live Def, we are unsure whether it is really dead due to
+      // no sub-register liveness tracking. Push it to a vector for a deferred
+      // check.
+      GPR32LiveDefs.push_back(MO.getReg());
+      continue;
+    }
+
+    // Record any GPR64 dead Def as some unmarked GPR32 could be an alias of
+    // its low 32-bit.
+    if (RegIsGPR64)
+      GPR64DeadDefs.push_back(MO.getReg());
+  }
+
+  // No GPR32 live Def, safe to return false.
+  if (GPR32LiveDefs.empty())
+    return false;
+
+  // No GPR64 dead Def, so all those GPR32 live Defs can't have aliases, and
+  // therefore must be truly live; safe to return true.
+  if (GPR64DeadDefs.empty())
+    return true;
+
+  // Otherwise, return true if any aliased SuperReg of GPR32 is not dead.
+ for (auto I : GPR32LiveDefs) + for (MCPhysReg SR : TRI->superregs(I)) + if (!llvm::is_contained(GPR64DeadDefs, SR)) + return true; + + return false; +} + +void BPFMIPreEmitChecking::processAtomicInsts() { + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() != BPF::XADDW && MI.getOpcode() != BPF::XADDD) + continue; + + LLVM_DEBUG(MI.dump()); + if (hasLiveDefs(MI, TRI)) { + DebugLoc Empty; + const DebugLoc &DL = MI.getDebugLoc(); + const Function &F = MF->getFunction(); + F.getContext().diagnose(DiagnosticInfoUnsupported{ + F, "Invalid usage of the XADD return value", DL}); + } + } + } +} + +} // namespace + +INITIALIZE_PASS(BPFMIPreEmitChecking, "bpf-mi-pemit-checking", + "BPF PreEmit Checking", false, false) + +char BPFMIPreEmitChecking::ID = 0; +FunctionPass *llvm::createBPFMIPreEmitCheckingPass() { + return new BPFMIPreEmitChecking(); +} diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp index 64b115b8fc8afa..7d91fa8bb824cf 100644 --- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp +++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp @@ -178,6 +178,7 @@ void BPFPassConfig::addMachineSSAOptimization() { } void BPFPassConfig::addPreEmitPass() { + addPass(createBPFMIPreEmitCheckingPass()); if (getOptLevel() != CodeGenOptLevel::None) if (!DisableMIPeephole) addPass(createBPFMIPreEmitPeepholePass()); diff --git a/llvm/lib/Target/BPF/CMakeLists.txt b/llvm/lib/Target/BPF/CMakeLists.txt index 253660d4d62e37..eade4cacb7100e 100644 --- a/llvm/lib/Target/BPF/CMakeLists.txt +++ b/llvm/lib/Target/BPF/CMakeLists.txt @@ -39,6 +39,7 @@ add_llvm_target(BPFCodeGen BPFSubtarget.cpp BPFTargetMachine.cpp BPFMIPeephole.cpp + BPFMIChecking.cpp BPFMISimplifyPatchable.cpp BTFDebug.cpp diff --git a/llvm/test/CodeGen/BPF/atomics.ll b/llvm/test/CodeGen/BPF/atomics.ll index 0c16c49f2a873b..c17b94af5f7bd9 100644 --- a/llvm/test/CodeGen/BPF/atomics.ll +++ b/llvm/test/CodeGen/BPF/atomics.ll @@ -1,10 +1,11 @@ -; RUN: llc < %s -march=bpfel -verify-machineinstrs -show-mc-encoding | FileCheck --check-prefixes=CHECK,CHECK-V2 %s -; RUN: llc < %s -march=bpfel -verify-machineinstrs -show-mc-encoding -mcpu=v3 | FileCheck --check-prefixes=CHECK,CHECK-V3 %s +; RUN: llc < %s -march=bpfel -verify-machineinstrs -show-mc-encoding | FileCheck %s +; RUN: llc < %s -march=bpfel -verify-machineinstrs -show-mc-encoding -mcpu=v3 | FileCheck --check-prefix=CHECK-V3 %s ; CHECK-LABEL: test_load_add_32 -; CHECK-V2: r2 = atomic_fetch_add((u32 *)(r1 + 0), r2) +; CHECK: lock *(u32 *)(r1 + 0) += r2 +; CHECK: encoding: [0xc3,0x21 ; CHECK-V3: w2 = atomic_fetch_add((u32 *)(r1 + 0), w2) -; CHECK: encoding: [0xc3,0x21,0x00,0x00,0x01,0x00,0x00,0x00] +; CHECK-V3: encoding: [0xc3,0x21,0x00,0x00,0x01,0x00,0x00,0x00] define void @test_load_add_32(ptr %p, i32 zeroext %v) { entry: atomicrmw add ptr %p, i32 %v seq_cst @@ -12,8 +13,10 @@ entry: } ; CHECK-LABEL: test_load_add_64 -; CHECK: r2 = atomic_fetch_add((u64 *)(r1 + 0), r2) -; CHECK: encoding: [0xdb,0x21,0x00,0x00,0x01,0x00,0x00,0x00] +; CHECK: lock *(u64 *)(r1 + 0) += r2 +; CHECK: encoding: [0xdb,0x21 +; CHECK-V3: r2 = atomic_fetch_add((u64 *)(r1 + 0), r2) +; CHECK-V3: encoding: [0xdb,0x21,0x00,0x00,0x01,0x00,0x00,0x00] define void @test_load_add_64(ptr %p, i64 zeroext %v) { entry: atomicrmw add ptr %p, i64 %v seq_cst diff --git a/llvm/test/CodeGen/BPF/atomics_2.ll b/llvm/test/CodeGen/BPF/atomics_2.ll index c670ddb05b6a77..6371e3b875638e 100644 --- a/llvm/test/CodeGen/BPF/atomics_2.ll +++ b/llvm/test/CodeGen/BPF/atomics_2.ll @@ 
-224,7 +224,7 @@ entry: } ; CHECK-LABEL: test_atomic_xor_64 -; CHECK: r2 = atomic_fetch_xor((u64 *)(r1 + 0), r2) +; CHECK: atomic_fetch_xor((u64 *)(r1 + 0), r2) ; CHECK: encoding: [0xdb,0x21,0x00,0x00,0xa1,0x00,0x00,0x00] ; CHECK: w0 = 0 define dso_local i32 @test_atomic_xor_64(ptr nocapture %p, i64 %v) local_unnamed_addr { diff --git a/llvm/test/CodeGen/BPF/objdump_atomics.ll b/llvm/test/CodeGen/BPF/objdump_atomics.ll index c4cb16b2c36418..fcc889ba300e39 100644 --- a/llvm/test/CodeGen/BPF/objdump_atomics.ll +++ b/llvm/test/CodeGen/BPF/objdump_atomics.ll @@ -2,7 +2,7 @@ ; CHECK-LABEL: test_load_add_32 ; CHECK: c3 21 -; CHECK: w2 = atomic_fetch_add((u32 *)(r1 + 0), w2) +; CHECK: lock *(u32 *)(r1 + 0) += w2 define void @test_load_add_32(ptr %p, i32 zeroext %v) { entry: atomicrmw add ptr %p, i32 %v seq_cst @@ -11,7 +11,7 @@ entry: ; CHECK-LABEL: test_load_add_64 ; CHECK: db 21 -; CHECK: r2 = atomic_fetch_add((u64 *)(r1 + 0), r2) +; CHECK: lock *(u64 *)(r1 + 0) += r2 define void @test_load_add_64(ptr %p, i64 zeroext %v) { entry: atomicrmw add ptr %p, i64 %v seq_cst diff --git a/llvm/test/CodeGen/BPF/xadd.ll b/llvm/test/CodeGen/BPF/xadd.ll new file mode 100644 index 00000000000000..5aeeb9baf7b892 --- /dev/null +++ b/llvm/test/CodeGen/BPF/xadd.ll @@ -0,0 +1,59 @@ +; RUN: not llc -march=bpfel < %s 2>&1 | FileCheck %s +; RUN: not llc -march=bpfeb < %s 2>&1 | FileCheck %s + +; This file is generated with the source command and source +; $ clang -target bpf -O2 -g -S -emit-llvm t.c +; $ cat t.c +; int test(int *ptr) { +; int r; +; __sync_fetch_and_add(ptr, 4); +; r = __sync_fetch_and_add(ptr, 6); +; return r; +; } + +; ModuleID = 't.c' +source_filename = "t.c" +target datalayout = "e-m:e-p:64:64-i64:64-n32:64-S128" +target triple = "bpf" + +; Function Attrs: nounwind +define dso_local i32 @test(ptr nocapture %ptr) local_unnamed_addr #0 !dbg !7 { +entry: + call void @llvm.dbg.value(metadata ptr %ptr, metadata !13, metadata !DIExpression()), !dbg !15 + %0 = atomicrmw add ptr %ptr, i32 4 seq_cst, !dbg !16 + %1 = atomicrmw add ptr %ptr, i32 6 seq_cst, !dbg !17 +; CHECK: in function test i32 (ptr): Invalid usage of the XADD return value + call void @llvm.dbg.value(metadata i32 %1, metadata !14, metadata !DIExpression()), !dbg !18 + ret i32 %1, !dbg !19 +} + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.value(metadata, metadata, metadata) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone speculatable } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 8.0.0 (trunk 342605) (llvm/trunk 342612)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "t.c", directory: "/home/yhs/work/tests/llvm/sync/test1") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 8.0.0 (trunk 342605) (llvm/trunk 342612)"} +!7 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, 
flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!8 = !DISubroutineType(types: !9)
+!9 = !{!10, !11}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64)
+!12 = !{!13, !14}
+!13 = !DILocalVariable(name: "ptr", arg: 1, scope: !7, file: !1, line: 1, type: !11)
+!14 = !DILocalVariable(name: "r", scope: !7, file: !1, line: 2, type: !10)
+!15 = !DILocation(line: 1, column: 15, scope: !7)
+!16 = !DILocation(line: 3, column: 4, scope: !7)
+!17 = !DILocation(line: 4, column: 8, scope: !7)
+!18 = !DILocation(line: 2, column: 8, scope: !7)
+!19 = !DILocation(line: 5, column: 4, scope: !7)
diff --git a/llvm/test/CodeGen/BPF/xadd_legal.ll b/llvm/test/CodeGen/BPF/xadd_legal.ll
index 88f04d85a779f8..9b07afade3fee9 100644
--- a/llvm/test/CodeGen/BPF/xadd_legal.ll
+++ b/llvm/test/CodeGen/BPF/xadd_legal.ll
@@ -19,7 +19,7 @@ define dso_local i32 @test(ptr nocapture %ptr, i64 %a) {
 entry:
   %conv = trunc i64 %a to i32
   %0 = atomicrmw add ptr %ptr, i32 %conv seq_cst
-; CHECK-64: r2 = atomic_fetch_add((u32 *)(r1 + 0), r2)
+; CHECK-64: lock *(u32 *)(r1 + 0) += r2
 ; CHECK-32: w2 = atomic_fetch_add((u32 *)(r1 + 0), w2)
   %1 = load i32, ptr %ptr, align 4
   ret i32 %1

From d66765ddf1ae9e16676a49cebd966258f8b5c6e0 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot
Date: Fri, 30 Aug 2024 21:01:09 +0000
Subject: [PATCH 97/98] [gn build] Port 06c531e808ce

---
 llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn
index 243a92f2e62587..aa594df8c164a1 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn
@@ -71,6 +71,7 @@ static_library("LLVMBPFCodeGen") {
     "BPFISelLowering.cpp",
     "BPFInstrInfo.cpp",
     "BPFMCInstLower.cpp",
+    "BPFMIChecking.cpp",
    "BPFMIPeephole.cpp",
     "BPFMISimplifyPatchable.cpp",
     "BPFPreserveDIType.cpp",

From 02654f7370638889b989b4d776d35c3d47c87cdd Mon Sep 17 00:00:00 2001
From: Chris B
Date: Fri, 30 Aug 2024 16:18:46 -0500
Subject: [PATCH 98/98] [HLSL][Doc] Document multi-argument resolution (#104474)

This updates the expected differences document to capture the
difference in multi-argument overload resolution between Clang and DXC.

Fixes #99530

---
 clang/docs/HLSL/ExpectedDifferences.rst | 121 +++++++++++++++++++++---
 1 file changed, 109 insertions(+), 12 deletions(-)

diff --git a/clang/docs/HLSL/ExpectedDifferences.rst b/clang/docs/HLSL/ExpectedDifferences.rst
index 4782eb3cda754a..e143c5b71575aa 100644
--- a/clang/docs/HLSL/ExpectedDifferences.rst
+++ b/clang/docs/HLSL/ExpectedDifferences.rst
@@ -54,6 +54,19 @@ HLSL 202x based on proposal
 and
 `0008 `_.

+The largest difference between Clang and DXC's overload resolution is the
+algorithm used for identifying best-match overloads. There are more details
+about the algorithmic differences in the :ref:`multi_argument_overloads` section
+below. There are three high-level differences that should be highlighted:
+
+* **There should be no cases** where DXC and Clang both successfully
+  resolve an overload where the resolved overload is different between the two.
+* There are cases where Clang will successfully resolve an overload that DXC
+  wouldn't because we've trimmed the overload set in Clang to remove ambiguity.
+* There are cases where DXC will successfully resolve an overload that Clang + will not for two reasons: (1) DXC only generates partial overload sets for + builtin functions and (2) DXC resolves cases that probably should be ambiguous. + Clang's implementation extends standard overload resolution rules to HLSL library functionality. This causes subtle changes in overload resolution behavior between Clang and DXC. Some examples include: @@ -71,18 +84,23 @@ behavior between Clang and DXC. Some examples include: uint U; int I; float X, Y, Z; - double3 A, B; + double3 R, G; } - void twoParams(int, int); - void twoParams(float, float); + void takesSingleDouble(double); + void takesSingleDouble(vector); + + void scalarOrVector(double); + void scalarOrVector(vector); export void call() { - halfOrInt16(U); // DXC: Fails with call ambiguous between int16_t and uint16_t overloads - // Clang: Resolves to halfOrInt16(uint16_t). - halfOrInt16(I); // All: Resolves to halfOrInt16(int16_t). half H; + halfOrInt16(I); // All: Resolves to halfOrInt16(int16_t). + #ifndef IGNORE_ERRORS + halfOrInt16(U); // All: Fails with call ambiguous between int16_t and uint16_t + // overloads + // asfloat16 is a builtin with overloads for half, int16_t, and uint16_t. H = asfloat16(I); // DXC: Fails to resolve overload for int. // Clang: Resolves to asfloat16(int16_t). @@ -94,21 +112,28 @@ behavior between Clang and DXC. Some examples include: takesDoubles(X, Y, Z); // Works on all compilers #ifndef IGNORE_ERRORS - fma(X, Y, Z); // DXC: Fails to resolve no known conversion from float to double. + fma(X, Y, Z); // DXC: Fails to resolve no known conversion from float to + // double. // Clang: Resolves to fma(double,double,double). - #endif - double D = dot(A, B); // DXC: Resolves to dot(double3, double3), fails DXIL Validation. + double D = dot(R, G); // DXC: Resolves to dot(double3, double3), fails DXIL Validation. // FXC: Expands to compute double dot product with fmul/fadd - // Clang: Resolves to dot(float3, float3), emits conversion warnings. + // Clang: Fails to resolve as ambiguous against + // dot(half, half) or dot(float, float) + #endif #ifndef IGNORE_ERRORS tan(B); // DXC: resolves to tan(float). // Clang: Fails to resolve, ambiguous between integer types. - twoParams(I, X); // DXC: resolves twoParams(int, int). - // Clang: Fails to resolve ambiguous conversions. #endif + + double D; + takesSingleDouble(D); // All: Fails to resolve ambiguous conversions. + takesSingleDouble(R); // All: Fails to resolve ambiguous conversions. + + scalarOrVector(D); // All: Resolves to scalarOrVector(double). + scalarOrVector(R); // All: Fails to resolve ambiguous conversions. } .. note:: @@ -119,3 +144,75 @@ behavior between Clang and DXC. Some examples include: diagnostic notifying the user of the conversion rather than silently altering precision relative to the other overloads (as FXC does) or generating code that will fail validation (as DXC does). + +.. _multi_argument_overloads: + +Multi-Argument Overloads +------------------------ + +In addition to the differences in single-element conversions, Clang and DXC +differ dramatically in multi-argument overload resolution. C++ multi-argument +overload resolution behavior (or something very similar) is required to +implement +`non-member operator overloading `_. 
+
+Clang adopts the C++-inspired language from the
+`draft HLSL specification `_,
+where an overload ``f1`` is a better candidate than ``f2`` if for all arguments
+the conversion sequence is not worse than the corresponding conversion sequence
+and for at least one argument it is better.
+
+.. code-block:: c++
+
+  cbuffer CB {
+    int I;
+    float X;
+    float4 V;
+  }
+
+  void twoParams(int, int);
+  void twoParams(float, float);
+  void threeParams(float, float, float);
+  void threeParams(float4, float4, float4);
+
+  export void call() {
+    twoParams(I, X); // DXC: resolves twoParams(int, int).
+                     // Clang: Fails to resolve ambiguous conversions.
+
+    threeParams(X, V, V); // DXC: resolves threeParams(float4, float4, float4).
+                          // Clang: Fails to resolve ambiguous conversions.
+  }
+
+In the examples above, ``twoParams`` called with mixed parameters produces the
+implicit conversion sequences { ExactMatch, FloatingIntegral } and {
+FloatingIntegral, ExactMatch }. In both cases an argument has a worse conversion
+in the other sequence, so the overload is ambiguous.
+
+In the ``threeParams`` example the sequences are { ExactMatch, VectorTruncation,
+VectorTruncation } or { VectorSplat, ExactMatch, ExactMatch }. Again, in both
+cases at least one parameter has a worse conversion in the other sequence, so
+the overload is ambiguous.
+
+.. note::
+
+  The behavior of DXC described below is undocumented, so what follows is
+  gleaned from observation and a bit of reading the source.
+
+DXC's approach for determining the best overload produces an integer score value
+for each implicit conversion sequence for each argument expression. Scores for
+casts are based on a bitmask construction that is complicated to reverse
+engineer. It seems that:
+
+* Exact match is 0
+* Dimension increase is 1
+* Promotion is 2
+* Integral -> Float conversion is 4
+* Float -> Integral conversion is 8
+* Cast is 16
+
+The masks are or'd against each other to produce a score for the cast.
+
+The scores of each conversion sequence are then summed to generate a score for
+the overload candidate. The overload candidate with the lowest score is the best
+candidate. If more than one overload matches the lowest score, the call is
+ambiguous.
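To make the tie-breaking mechanics concrete, here is a small self-contained sketch of the scoring model described above. It is a reconstruction from the reverse-engineered constants, not DXC's actual code; ``pickBest`` and the enumerator names are invented for illustration.

.. code-block:: c++

  #include <cstddef>
  #include <cstdint>
  #include <optional>
  #include <vector>

  // Invented labels for the observed per-cast score bits listed above.
  enum : std::uint32_t {
    ExactMatch = 0,
    DimensionIncrease = 1,
    Promotion = 2,
    IntegralToFloat = 4,
    FloatToIntegral = 8,
    Cast = 16,
  };

  // Each candidate holds one score per argument (the or'd-together bitmask
  // for that argument's conversion). A candidate's total is the sum over its
  // arguments; the unique minimum wins, and a shared minimum is ambiguous.
  std::optional<std::size_t>
  pickBest(const std::vector<std::vector<std::uint32_t>> &candidates) {
    std::optional<std::size_t> best;
    std::uint32_t bestTotal = UINT32_MAX;
    bool tied = false;
    for (std::size_t i = 0; i < candidates.size(); ++i) {
      std::uint32_t total = 0;
      for (std::uint32_t castScore : candidates[i])
        total += castScore;
      if (total < bestTotal) {
        bestTotal = total;
        best = i;
        tied = false;
      } else if (total == bestTotal) {
        tied = true;
      }
    }
    if (tied)
      return std::nullopt;
    return best;
  }

Under this model, a candidate whose arguments are all exact matches totals 0 and wins outright, while two candidates that sum to the same total yield no winner, mirroring the ambiguous calls shown earlier.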