diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 0e7f6901e7ce8..d47c42a0ade52 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -20506,10 +20506,17 @@ GenTree* Compiler::gtNewSimdAbsNode(var_types type, GenTree* op1, CorInfoType si GenTree* bitMask; - bitMask = gtNewDconNode(-0.0, simdBaseType); - bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, simdBaseJitType, simdSize); - - return gtNewSimdBinOpNode(GT_AND_NOT, type, op1, bitMask, simdBaseJitType, simdSize); + if (simdBaseType == TYP_FLOAT) + { + bitMask = gtNewIconNode(0x7FFFFFFF); + bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, CORINFO_TYPE_INT, simdSize); + } + else + { + bitMask = gtNewLconNode(0x7FFFFFFFFFFFFFFF); + bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, CORINFO_TYPE_LONG, simdSize); + } + return gtNewSimdBinOpNode(GT_AND, type, op1, bitMask, simdBaseJitType, simdSize); } NamedIntrinsic intrinsic = NI_Illegal; @@ -20750,12 +20757,6 @@ GenTree* Compiler::gtNewSimdBinOpNode( } } } - - if (op == GT_AND_NOT) - { - // GT_AND_NOT expects `op1 & ~op2`, but xarch does `~op1 & op2` - needsReverseOps = true; - } break; } #endif // TARGET_XARCH @@ -20786,11 +20787,34 @@ GenTree* Compiler::gtNewSimdBinOpNode( if (intrinsic != NI_Illegal) { + if (op == GT_AND_NOT) + { + assert(fgNodeThreading == NodeThreading::LIR); + +#if defined(TARGET_XARCH) + // GT_AND_NOT expects `op1 & ~op2`, but xarch does `~op1 & op2` + // We specially handle this here since we're only producing a + // native intrinsic node in LIR + + std::swap(op1, op2); +#endif // TARGET_XARCH + } return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize); } switch (op) { + case GT_AND_NOT: + { + // Prior to LIR, we want to explicitly decompose this operation so that downstream phases can + // appropriately optimize around the individual operations being performed, particularly ~op2, + // and produce overall better codegen. 
+            assert(fgNodeThreading != NodeThreading::LIR);
+
+            op2 = gtNewSimdUnOpNode(GT_NOT, type, op2, simdBaseJitType, simdSize);
+            return gtNewSimdBinOpNode(GT_AND, type, op1, op2, simdBaseJitType, simdSize);
+        }
+
#if defined(TARGET_XARCH)
        case GT_RSZ:
        {
@@ -20955,9 +20979,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(
                    vecCon1->gtSimdVal.u64[i] = 0x00FF00FF00FF00FF;
                }
-                // Validate we can't use AVX512F_VL_TernaryLogic here
-                assert(!canUseEvexEncodingDebugOnly());
-
                // Vector256 maskedProduct = Avx2.And(widenedProduct, vecCon1).AsInt16()
                GenTree* maskedProduct = gtNewSimdBinOpNode(GT_AND, widenedType, widenedProduct, vecCon1, widenedSimdBaseJitType, widenedSimdSize);
@@ -21922,9 +21943,6 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
                v = gtNewSimdHWIntrinsicNode(type, v, gtNewIconNode(SHUFFLE_ZZXX, TYP_INT), NI_SSE2_Shuffle, CORINFO_TYPE_INT, simdSize);
-                // Validate we can't use AVX512F_VL_TernaryLogic here
-                assert(!canUseEvexEncodingDebugOnly());
-
                op2 = gtNewSimdBinOpNode(GT_AND, type, u, v, simdBaseJitType, simdSize);
                return gtNewSimdBinOpNode(GT_OR, type, op1, op2, simdBaseJitType, simdSize);
            }
@@ -24315,9 +24333,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(
            GenTree* vecCon2 = gtCloneExpr(vecCon1);
-            // Validate we can't use AVX512F_VL_TernaryLogic here
-            assert(!canUseEvexEncodingDebugOnly());
-
            tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
            tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
            tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_UBYTE,
@@ -24356,9 +24371,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(
            GenTree* vecCon2 = gtCloneExpr(vecCon1);
-            // Validate we can't use AVX512F_VL_TernaryLogic here
-            assert(!canUseEvexEncodingDebugOnly());
-
            tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
            tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
            tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_USHORT,
@@ -24460,9 +24472,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(
            GenTree* vecCon2 = gtCloneExpr(vecCon1);
-            // Validate we can't use AVX512F_VL_TernaryLogic here
-            assert(!canUseEvexEncodingDebugOnly());
-
            tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
            tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
@@ -24499,9 +24508,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(
            GenTree* vecCon2 = gtCloneExpr(vecCon1);
-            // Validate we can't use AVX512F_VL_TernaryLogic here
-            assert(!canUseEvexEncodingDebugOnly());
-
            tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
            tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
@@ -28120,6 +28126,14 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp,
            assert(!isScalar);
            assert(op2->TypeIs(simdType));
+            if (comp->fgNodeThreading != NodeThreading::LIR)
+            {
+                // We don't want to support creating AND_NOT nodes prior to LIR
+                // as it can break important optimizations. We'll produce this
+                // in lowering instead.
+ break; + } + #if defined(TARGET_XARCH) if (simdSize == 64) { @@ -29187,6 +29201,21 @@ bool GenTreeHWIntrinsic::ShouldConstantProp(GenTree* operand, GenTreeVecCon* vec return IsUserCall() && (operand == Op(2)); } +#if defined(TARGET_XARCH) + case NI_SSE_Xor: + case NI_SSE2_Xor: + case NI_AVX_Xor: + case NI_AVX2_Xor: + case NI_AVX512F_Xor: + case NI_AVX512DQ_Xor: + case NI_AVX10v1_V512_Xor: + { + // We recognize this as GT_NOT which can enable other optimizations + assert(GetOperandCount() == 2); + return vecCon->IsVectorAllBitsSet(); + } +#endif // TARGET_XARCH + default: { break; @@ -29936,7 +29965,8 @@ bool GenTreeLclVar::IsNeverNegative(Compiler* comp) const unsigned GenTreeHWIntrinsic::GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree* op1, GenTree* op2, GenTree* op3) { #if defined(TARGET_XARCH) - assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId) || HWIntrinsicInfo::IsPermuteVar2x(gtHWIntrinsicId)); + assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId) || HWIntrinsicInfo::IsPermuteVar2x(gtHWIntrinsicId) || + HWIntrinsicInfo::IsTernaryLogic(gtHWIntrinsicId)); #elif defined(TARGET_ARM64) assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId)); #endif @@ -29980,85 +30010,6 @@ unsigned GenTreeHWIntrinsic::GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree return 0; } - -//------------------------------------------------------------------------ -// GetTernaryControlByte: calculate the value of the control byte for ternary node -// with given logic nodes on the input. -// -// Return value: the value of the ternary control byte. -uint8_t GenTreeHWIntrinsic::GetTernaryControlByte(GenTreeHWIntrinsic* second) const -{ - // we assume we have a structure like: - /* - /- A - +- B - t1 = binary logical op1 - - /- C - +- t1 - t2 = binary logical op2 - */ - - // To calculate the control byte value: - // The way the constants work is we have three keys: - // * A: 0xF0 - // * B: 0xCC - // * C: 0xAA - // - // To compute the correct control byte, you simply perform the corresponding operation on these keys. So, if you - // wanted to do (A & B) ^ C, you would compute (0xF0 & 0xCC) ^ 0xAA or 0x6A. 
- assert(second->Op(1) == this || second->Op(2) == this); - const uint8_t A = 0xF0; - const uint8_t B = 0xCC; - const uint8_t C = 0xAA; - - bool isScalar = false; - - genTreeOps firstOper = GetOperForHWIntrinsicId(&isScalar); - assert(!isScalar); - - genTreeOps secondOper = second->GetOperForHWIntrinsicId(&isScalar); - assert(!isScalar); - - uint8_t AB = 0; - uint8_t ABC = 0; - - if (firstOper == GT_AND) - { - AB = A & B; - } - else if (firstOper == GT_OR) - { - AB = A | B; - } - else if (firstOper == GT_XOR) - { - AB = A ^ B; - } - else - { - unreached(); - } - - if (secondOper == GT_AND) - { - ABC = AB & C; - } - else if (secondOper == GT_OR) - { - ABC = AB | C; - } - else if (secondOper == GT_XOR) - { - ABC = AB ^ C; - } - else - { - unreached(); - } - - return ABC; -} #endif // TARGET_XARCH && FEATURE_HW_INTRINSICS unsigned GenTreeLclFld::GetSize() const @@ -30454,13 +30405,8 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) bool isScalar = false; genTreeOps oper = tree->GetOperForHWIntrinsicId(&isScalar); -#if defined(TARGET_XARCH) - if (oper == GT_AND_NOT) - { - // xarch does: ~op1 & op2, we need op1 & ~op2 - std::swap(op1, op2); - } -#endif // TARGET_XARCH + // We shouldn't find AND_NOT nodes since it should only be produced in lowering + assert(oper != GT_AND_NOT); GenTree* cnsNode = nullptr; GenTree* otherNode = nullptr; @@ -30973,31 +30919,6 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) break; } - case GT_AND_NOT: - { - // Handle `x & ~0 == x` and `0 & ~x == 0` - if (cnsNode->IsVectorZero()) - { - if (cnsNode == op1) - { - resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT); - break; - } - else - { - resultNode = otherNode; - } - break; - } - - // Handle `x & ~AllBitsSet == 0` - if (cnsNode->IsVectorAllBitsSet() && (cnsNode == op2)) - { - resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT); - } - break; - } - case GT_DIV: { if (varTypeIsFloating(simdBaseType)) @@ -31388,12 +31309,12 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) { switch (ni) { - case NI_Vector128_ConditionalSelect: #if defined(TARGET_XARCH) + case NI_Vector128_ConditionalSelect: case NI_Vector256_ConditionalSelect: case NI_Vector512_ConditionalSelect: #elif defined(TARGET_ARM64) - case NI_Vector64_ConditionalSelect: + case NI_AdvSimd_BitwiseSelect: case NI_Sve_ConditionalSelect: #endif { diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 596da62ac7f7d..5b7bc8278cab7 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -6527,7 +6527,6 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic bool OperRequiresGlobRefFlag() const; unsigned GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree* op1, GenTree* op2, GenTree* op3); - uint8_t GetTernaryControlByte(GenTreeHWIntrinsic* second) const; ClassLayout* GetLayout(Compiler* compiler) const; diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index 06b6eb429caa9..5af8e67d67689 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -352,6 +352,365 @@ const TernaryLogicInfo& TernaryLogicInfo::lookup(uint8_t control) return ternaryLogicFlags[control]; } + +//------------------------------------------------------------------------ +// GetTernaryControlByte: Get the control byte for a TernaryLogic operation +// given the oper and two existing control bytes +// +// Arguments: +// oper -- the operation being performed +// op1 -- the control byte for op1 +// op2 -- the 
control byte for op2
+//
+// Return Value:
+//    The new control byte evaluated from performing oper on op1 and op2
+//
+uint8_t TernaryLogicInfo::GetTernaryControlByte(genTreeOps oper, uint8_t op1, uint8_t op2)
+{
+    switch (oper)
+    {
+        case GT_AND:
+        {
+            return static_cast<uint8_t>(op1 & op2);
+        }
+
+        case GT_AND_NOT:
+        {
+            return static_cast<uint8_t>(~op1 & op2);
+        }
+
+        case GT_OR:
+        {
+            return static_cast<uint8_t>(op1 | op2);
+        }
+
+        case GT_XOR:
+        {
+            return static_cast<uint8_t>(op1 ^ op2);
+        }
+
+        default:
+        {
+            unreached();
+        }
+    }
+}
+
+//------------------------------------------------------------------------
+// GetTernaryControlByte: Get the control byte for a TernaryLogic operation
+//    given a ternary logic oper and two inputs
+//
+// Arguments:
+//    oper -- the operation being performed
+//    op1  -- the control byte for op1, this is ignored for unary oper
+//    op2  -- the control byte for op2
+//
+// Return Value:
+//    The new control byte evaluated from performing oper on op1 and op2
+//
+uint8_t TernaryLogicInfo::GetTernaryControlByte(TernaryLogicOperKind oper, uint8_t op1, uint8_t op2)
+{
+    switch (oper)
+    {
+        case TernaryLogicOperKind::Select:
+        {
+            return op2;
+        }
+
+        case TernaryLogicOperKind::Not:
+        {
+            return ~op2;
+        }
+
+        case TernaryLogicOperKind::And:
+        {
+            return op1 & op2;
+        }
+
+        case TernaryLogicOperKind::Nand:
+        {
+            return ~(op1 & op2);
+        }
+
+        case TernaryLogicOperKind::Or:
+        {
+            return op1 | op2;
+        }
+
+        case TernaryLogicOperKind::Nor:
+        {
+            return ~(op1 | op2);
+        }
+
+        case TernaryLogicOperKind::Xor:
+        {
+            return op1 ^ op2;
+        }
+
+        case TernaryLogicOperKind::Xnor:
+        {
+            return ~(op1 ^ op2);
+        }
+
+        default:
+        {
+            unreached();
+        }
+    }
+}
+
+//------------------------------------------------------------------------
+// GetTernaryControlByte: Get the control byte for a TernaryLogic operation
+//    given an existing info and three control bytes
+//
+// Arguments:
+//    info -- the info describing the operation being performed
+//    op1  -- the control byte for op1
+//    op2  -- the control byte for op2
+//    op3  -- the control byte for op3
+//
+// Return Value:
+//    The new control byte evaluated from performing info on op1, op2, and op3
+//
+uint8_t TernaryLogicInfo::GetTernaryControlByte(const TernaryLogicInfo& info, uint8_t op1, uint8_t op2, uint8_t op3)
+{
+    uint8_t oper1Result;
+
+    switch (info.oper1Use)
+    {
+        case TernaryLogicUseFlags::None:
+        {
+            assert(info.oper2 == TernaryLogicOperKind::None);
+            assert(info.oper2Use == TernaryLogicUseFlags::None);
+
+            assert(info.oper3 == TernaryLogicOperKind::None);
+            assert(info.oper3Use == TernaryLogicUseFlags::None);
+
+            switch (info.oper1)
+            {
+                case TernaryLogicOperKind::False:
+                {
+                    oper1Result = 0x00;
+                    break;
+                }
+
+                case TernaryLogicOperKind::True:
+                {
+                    oper1Result = 0xFF;
+                    break;
+                }
+
+                default:
+                {
+                    unreached();
+                }
+            }
+            break;
+        }
+
+        case TernaryLogicUseFlags::A:
+        {
+            oper1Result = GetTernaryControlByte(info.oper1, 0x00, op1);
+            break;
+        }
+
+        case TernaryLogicUseFlags::B:
+        {
+            oper1Result = GetTernaryControlByte(info.oper1, 0x00, op2);
+            break;
+        }
+
+        case TernaryLogicUseFlags::C:
+        {
+            oper1Result = GetTernaryControlByte(info.oper1, 0x00, op3);
+            break;
+        }
+
+        case TernaryLogicUseFlags::AB:
+        {
+            oper1Result = GetTernaryControlByte(info.oper1, op1, op2);
+            break;
+        }
+
+        case TernaryLogicUseFlags::AC:
+        {
+            oper1Result = GetTernaryControlByte(info.oper1, op1, op3);
+            break;
+        }
+
+        case TernaryLogicUseFlags::BC:
+        {
+            oper1Result = GetTernaryControlByte(info.oper1, op2, op3);
+            break;
+        }
+
+        case TernaryLogicUseFlags::ABC:
+        {
+            assert(info.oper2 ==
TernaryLogicOperKind::None); + assert(info.oper2Use == TernaryLogicUseFlags::None); + + assert(info.oper3 == TernaryLogicOperKind::None); + assert(info.oper3Use == TernaryLogicUseFlags::None); + + switch (info.oper1) + { + case TernaryLogicOperKind::Nor: + { + oper1Result = ~(op1 | op2 | op3); + break; + } + + case TernaryLogicOperKind::Minor: + { + oper1Result = 0x17; + break; + } + + case TernaryLogicOperKind::Xnor: + { + oper1Result = ~(op1 ^ op2 ^ op3); + break; + } + + case TernaryLogicOperKind::Nand: + { + oper1Result = ~(op1 & op2 & op3); + break; + } + + case TernaryLogicOperKind::And: + { + oper1Result = op1 & op2 & op3; + break; + } + + case TernaryLogicOperKind::Xor: + { + oper1Result = op1 ^ op2 ^ op3; + break; + } + + case TernaryLogicOperKind::Major: + { + oper1Result = 0xE8; + break; + } + + case TernaryLogicOperKind::Or: + { + oper1Result = op1 | op2 | op3; + break; + } + + default: + { + unreached(); + } + } + break; + } + + default: + { + unreached(); + } + } + + uint8_t oper2Result; + + switch (info.oper2Use) + { + case TernaryLogicUseFlags::None: + { + assert(info.oper3 == TernaryLogicOperKind::None); + assert(info.oper3Use == TernaryLogicUseFlags::None); + + oper2Result = oper1Result; + break; + } + + case TernaryLogicUseFlags::A: + { + oper2Result = GetTernaryControlByte(info.oper2, oper1Result, op1); + break; + } + + case TernaryLogicUseFlags::B: + { + oper2Result = GetTernaryControlByte(info.oper2, oper1Result, op2); + break; + } + + case TernaryLogicUseFlags::C: + { + oper2Result = GetTernaryControlByte(info.oper2, oper1Result, op3); + break; + } + + case TernaryLogicUseFlags::AB: + { + oper2Result = GetTernaryControlByte(info.oper2, op1, op2); + break; + } + + case TernaryLogicUseFlags::AC: + { + oper2Result = GetTernaryControlByte(info.oper2, op1, op3); + break; + } + + case TernaryLogicUseFlags::BC: + { + oper2Result = GetTernaryControlByte(info.oper2, op2, op3); + break; + } + + default: + { + unreached(); + } + } + + uint8_t oper3Result; + + switch (info.oper3Use) + { + case TernaryLogicUseFlags::None: + { + assert(info.oper3 == TernaryLogicOperKind::None); + oper3Result = oper2Result; + break; + } + + case TernaryLogicUseFlags::A: + { + assert(info.oper3 == TernaryLogicOperKind::Cond); + oper3Result = (oper1Result & op1) | (oper2Result & ~op1); + break; + } + + case TernaryLogicUseFlags::B: + { + assert(info.oper3 == TernaryLogicOperKind::Cond); + oper3Result = (oper1Result & op2) | (oper2Result & ~op2); + break; + } + + case TernaryLogicUseFlags::C: + { + assert(info.oper3 == TernaryLogicOperKind::Cond); + oper3Result = (oper1Result & op3) | (oper2Result & ~op3); + break; + } + + default: + { + unreached(); + } + } + + return oper3Result; +} #endif // TARGET_XARCH //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h index 915657e014631..52cb9eb9a4806 100644 --- a/src/coreclr/jit/hwintrinsic.h +++ b/src/coreclr/jit/hwintrinsic.h @@ -481,6 +481,10 @@ struct TernaryLogicInfo static const TernaryLogicInfo& lookup(uint8_t control); + static uint8_t GetTernaryControlByte(genTreeOps oper, uint8_t op1, uint8_t op2); + static uint8_t GetTernaryControlByte(TernaryLogicOperKind oper, uint8_t op1, uint8_t op2); + static uint8_t GetTernaryControlByte(const TernaryLogicInfo& info, uint8_t op1, uint8_t op2, uint8_t op3); + TernaryLogicUseFlags GetAllUseFlags() const { uint8_t useFlagsBits = 0; @@ -1024,6 +1028,11 @@ struct HWIntrinsicInfo HWIntrinsicFlag flags = 
lookupFlags(id);
        return (flags & HW_Flag_PermuteVar2x) != 0;
    }
+
+    static bool IsTernaryLogic(NamedIntrinsic id)
+    {
+        return (id == NI_AVX512F_TernaryLogic) || (id == NI_AVX512F_VL_TernaryLogic) || (id == NI_AVX10v1_TernaryLogic);
+    }
#endif // TARGET_XARCH
#if defined(TARGET_ARM64)
diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp
index 2827565569e42..a3fed94ee71a8 100644
--- a/src/coreclr/jit/hwintrinsicarm64.cpp
+++ b/src/coreclr/jit/hwintrinsicarm64.cpp
@@ -611,15 +611,39 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
            break;
        }
+        case NI_AdvSimd_BitwiseClear:
        case NI_Vector64_AndNot:
        case NI_Vector128_AndNot:
        {
            assert(sig->numArgs == 2);
+            // We don't want to support creating AND_NOT nodes prior to LIR
+            // as it can break important optimizations. We'll produce this
+            // in lowering instead, so decompose into the individual operations
+            // on import
+
            op2 = impSIMDPopStack();
            op1 = impSIMDPopStack();
-            retNode = gtNewSimdBinOpNode(GT_AND_NOT, retType, op1, op2, simdBaseJitType, simdSize);
+            op2 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize));
+            retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize);
+            break;
+        }
+
+        case NI_AdvSimd_OrNot:
+        {
+            assert(sig->numArgs == 2);
+
+            // We don't want to support creating OR_NOT nodes prior to LIR
+            // as it can break important optimizations. We'll produce this
+            // in lowering instead, so decompose into the individual operations
+            // on import
+
+            op2 = impSIMDPopStack();
+            op1 = impSIMDPopStack();
+
+            op2 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize));
+            retNode = gtNewSimdBinOpNode(GT_OR, retType, op1, op2, simdBaseJitType, simdSize);
            break;
        }
diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
index d96690c400360..ed26f3d7490b0 100644
--- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
+++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
@@ -1298,9 +1298,7 @@ void CodeGen::genHWIntrinsic_R_R_R_RM_I(
        // non-RMW based codegen.
#if defined(DEBUG)
-        NamedIntrinsic intrinsicId = node->GetHWIntrinsicId();
-        assert((intrinsicId == NI_AVX512F_TernaryLogic) || (intrinsicId == NI_AVX512F_VL_TernaryLogic) ||
-               (intrinsicId == NI_AVX10v1_TernaryLogic));
+        assert(HWIntrinsicInfo::IsTernaryLogic(node->GetHWIntrinsicId()));
        uint8_t control = static_cast<uint8_t>(ival);
        const TernaryLogicInfo& info = TernaryLogicInfo::lookup(control);
@@ -1311,6 +1309,19 @@
            op2Reg = targetReg;
        }
+        else
+        {
+#if defined(DEBUG)
+            if (HWIntrinsicInfo::IsTernaryLogic(node->GetHWIntrinsicId()))
+            {
+                uint8_t control = static_cast<uint8_t>(ival);
+                const TernaryLogicInfo& info = TernaryLogicInfo::lookup(control);
+                TernaryLogicUseFlags useFlags = info.GetAllUseFlags();
+
+                assert(useFlags == TernaryLogicUseFlags::BC);
+            }
+#endif // DEBUG
+        }
    }
    assert(targetReg != REG_NA);
@@ -2856,6 +2867,46 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node, insOpts instOption
            break;
        }
+        case NI_EVEX_XnorMask:
+        {
+            assert(instOptions == INS_OPTS_NONE);
+
+            uint32_t simdSize = node->GetSimdSize();
+            uint32_t count = simdSize / genTypeSize(baseType);
+
+            if (count <= 8)
+            {
+                assert((count == 2) || (count == 4) || (count == 8));
+                ins = INS_kxnorb;
+            }
+            else if (count == 16)
+            {
+                ins = INS_kxnorw;
+            }
+            else if (count == 32)
+            {
+                ins = INS_kxnord;
+            }
+            else
+            {
+                assert(count == 64);
+                ins = INS_kxnorq;
+            }
+
+            op1Reg = op1->GetRegNum();
+
+            GenTree* op2 = node->Op(2);
+            regNumber op2Reg = op2->GetRegNum();
+
+            assert(emitter::isMaskReg(targetReg));
+            assert(emitter::isMaskReg(op1Reg));
+            assert(emitter::isMaskReg(op2Reg));
+
+            // Use EA_32BYTE to ensure the VEX.L bit gets set
+            emit->emitIns_R_R_R(ins, EA_32BYTE, targetReg, op1Reg, op2Reg);
+            break;
+        }
+
        case NI_AVX512F_ConvertToInt32:
        case NI_AVX512F_ConvertToUInt32:
        case NI_AVX512F_ConvertToUInt32WithTruncation:
diff --git a/src/coreclr/jit/hwintrinsiclistarm64.h b/src/coreclr/jit/hwintrinsiclistarm64.h
index 6f3130264e7d7..e5c54aabfd276 100644
--- a/src/coreclr/jit/hwintrinsiclistarm64.h
+++ b/src/coreclr/jit/hwintrinsiclistarm64.h
@@ -263,7 +263,7 @@ HARDWARE_INTRINSIC(AdvSimd, AddScalar,
HARDWARE_INTRINSIC(AdvSimd, AddWideningLower, 8, 2, true, {INS_saddl, INS_uaddl, INS_saddl, INS_uaddl, INS_saddl, INS_uaddl, INS_saddw, INS_uaddw, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromSecondArg|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AdvSimd, AddWideningUpper, 16, 2, true, {INS_saddl2, INS_uaddl2, INS_saddl2, INS_uaddl2, INS_saddl2, INS_uaddl2, INS_saddw2, INS_uaddw2, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromSecondArg|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AdvSimd, And, -1, 2, true, {INS_and, INS_and, INS_and, INS_and, INS_and, INS_and, INS_and, INS_and, INS_and, INS_and}, HW_Category_SIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AdvSimd, BitwiseClear, -1, 2, true, {INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic}, HW_Category_SIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AdvSimd, BitwiseClear, -1, 2, true, {INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic}, HW_Category_SIMD, HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(AdvSimd, BitwiseSelect, -1, 3, true, {INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl}, HW_Category_SIMD, HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AdvSimd, Ceiling, -1, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_frintp,
INS_invalid}, HW_Category_SIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AdvSimd, CeilingScalar, 8, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_frintp, INS_frintp}, HW_Category_SIMD, HW_Flag_SIMDScalar) @@ -401,7 +401,7 @@ HARDWARE_INTRINSIC(AdvSimd, NegateSaturate, HARDWARE_INTRINSIC(AdvSimd, NegateScalar, 8, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_fneg, INS_fneg}, HW_Category_SIMD, HW_Flag_SIMDScalar) HARDWARE_INTRINSIC(AdvSimd, Not, -1, 1, true, {INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn}, HW_Category_SIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AdvSimd, Or, -1, 2, true, {INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr}, HW_Category_SIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(AdvSimd, OrNot, -1, 2, true, {INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn}, HW_Category_SIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AdvSimd, OrNot, -1, 2, true, {INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn}, HW_Category_SIMD, HW_Flag_SpecialImport) HARDWARE_INTRINSIC(AdvSimd, PolynomialMultiply, -1, 2, true, {INS_pmul, INS_pmul, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AdvSimd, PolynomialMultiplyWideningLower, 8, 2, true, {INS_pmull, INS_pmull, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_Commutative) HARDWARE_INTRINSIC(AdvSimd, PolynomialMultiplyWideningUpper, 16, 2, true, {INS_pmull2, INS_pmull2, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_Commutative) diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 4513494d6dd02..7efd7c6d284a5 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -388,7 +388,7 @@ HARDWARE_INTRINSIC(X86Base_X64, DivRem, HARDWARE_INTRINSIC(SSE, Add, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE, AddScalar, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, And, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(SSE, AndNot, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(SSE, AndNot, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, 
HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE, CompareEqual, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareGreaterThan, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareGreaterThanOrEqual, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) @@ -468,7 +468,7 @@ HARDWARE_INTRINSIC(SSE, Subtract, HARDWARE_INTRINSIC(SSE, SubtractScalar, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, UnpackHigh, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpckhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE, UnpackLow, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpcklps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(SSE, Xor, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(SSE, Xor, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp) #define LAST_NI_SSE NI_SSE_Xor // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -492,7 +492,7 @@ HARDWARE_INTRINSIC(SSE2, Add, HARDWARE_INTRINSIC(SSE2, AddSaturate, 16, 2, true, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE2, AddScalar, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE2, And, 16, 2, true, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_andpd}, HW_Category_SimpleSIMD, 
HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(SSE2, AndNot, 16, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(SSE2, AndNot, 16, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE2, Average, 16, 2, true, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE2, CompareEqual, 16, 2, true, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareGreaterThan, 16, 2, true, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) @@ -590,7 +590,7 @@ HARDWARE_INTRINSIC(SSE2, SubtractScalar, HARDWARE_INTRINSIC(SSE2, SumAbsoluteDifferences, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_psadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE2, UnpackHigh, 16, 2, true, {INS_punpckhbw, INS_punpckhbw, INS_punpckhwd, INS_punpckhwd, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq, INS_punpckhqdq, INS_invalid, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE2, UnpackLow, 16, 2, true, {INS_punpcklbw, INS_punpcklbw, INS_punpcklwd, INS_punpcklwd, INS_punpckldq, INS_punpckldq, INS_punpcklqdq, INS_punpcklqdq, INS_invalid, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(SSE2, Xor, 16, 2, true, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(SSE2, Xor, 16, 2, true, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp) #define LAST_NI_SSE2 NI_SSE2_Xor // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -723,7 +723,7 @@ HARDWARE_INTRINSIC(SSE42_X64, Crc32, HARDWARE_INTRINSIC(AVX, Add, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, 
HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX, AddSubtract, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, And, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX, AndNot, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX, AndNot, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX, Blend, 32, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, BlendVariable, 32, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, BroadcastScalarToVector128, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) @@ -791,7 +791,7 @@ HARDWARE_INTRINSIC(AVX, TestNotZAndNotC, HARDWARE_INTRINSIC(AVX, TestZ, -1, 2, true, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, UnpackHigh, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpckhps, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX, UnpackLow, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpcklps, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX, Xor, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX, Xor, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp) #define LAST_NI_AVX NI_AVX_Xor // 
*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -805,7 +805,7 @@ HARDWARE_INTRINSIC(AVX2, Add, HARDWARE_INTRINSIC(AVX2, AddSaturate, 32, 2, true, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX2, AlignRight, 32, 3, false, {INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX2, And, 32, 2, false, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX2, AndNot, 32, 2, false, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX2, AndNot, 32, 2, false, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX2, Average, 32, 2, true, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX2, Blend, -1, 3, true, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_vpblendd, INS_vpblendd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, false, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) @@ -865,7 +865,7 @@ HARDWARE_INTRINSIC(AVX2, SubtractSaturate, HARDWARE_INTRINSIC(AVX2, SumAbsoluteDifferences, 32, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_psadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2, UnpackHigh, 32, 2, true, {INS_punpckhbw, INS_punpckhbw, INS_punpckhwd, INS_punpckhwd, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq, INS_punpckhqdq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX2, UnpackLow, 32, 2, true, {INS_punpcklbw, INS_punpcklbw, INS_punpcklwd, INS_punpcklwd, INS_punpckldq, INS_punpckldq, INS_punpcklqdq, INS_punpcklqdq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX2, Xor, 32, 2, false, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, 
HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX2, Xor, 32, 2, false, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp) #define LAST_NI_AVX2 NI_AVX2_Xor // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -880,7 +880,7 @@ HARDWARE_INTRINSIC(AVX512F, AddScalar, HARDWARE_INTRINSIC(AVX512F, AlignRight32, 64, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignd, INS_valignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512F, AlignRight64, 64, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignq, INS_valignq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512F, And, 64, 2, true, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_vpandq, INS_vpandq, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512F, BlendVariable, 64, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MaybeMemoryLoad) @@ -1002,7 +1002,7 @@ HARDWARE_INTRINSIC(AVX512F, SubtractScalar, HARDWARE_INTRINSIC(AVX512F, TernaryLogic, 64, 4, true, {INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogq, INS_vpternlogq, INS_vpternlogd, INS_vpternlogq}, HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512F, UnpackHigh, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq, INS_punpckhqdq, INS_unpckhps, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512F, UnpackLow, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_punpckldq, INS_punpckldq, INS_punpcklqdq, INS_punpcklqdq, INS_unpcklps, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX512F, Xor, 64, 2, true, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_vpxorq, INS_vpxorq, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX512F, Xor, 64, 2, true, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_vpxorq, INS_vpxorq, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp) #define LAST_NI_AVX512F NI_AVX512F_Xor // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -1183,7 +1183,7 @@ HARDWARE_INTRINSIC(AVX512CD_VL, LeadingZeroCount, // AVX512DQ Intrinsics #define FIRST_NI_AVX512DQ NI_AVX512DQ_And HARDWARE_INTRINSIC(AVX512DQ, And, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX512DQ, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX512DQ, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512DQ, BroadcastPairScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x2, INS_vbroadcasti32x2, INS_invalid, INS_invalid, INS_vbroadcastf32x2, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512DQ, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x2, INS_vbroadcasti64x2, INS_invalid, INS_vbroadcastf64x2}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512DQ, BroadcastVector256ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x8, INS_vbroadcasti32x8, INS_invalid, INS_invalid, INS_vbroadcastf32x8, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) @@ -1203,7 +1203,7 @@ HARDWARE_INTRINSIC(AVX512DQ, Range, HARDWARE_INTRINSIC(AVX512DQ, RangeScalar, 16, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrangess, INS_vrangesd}, HW_Category_IMM, HW_Flag_CopyUpperBits) 
HARDWARE_INTRINSIC(AVX512DQ, Reduce, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vreduceps, INS_vreducepd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512DQ, ReduceScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vreducess, INS_vreducesd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(AVX512DQ, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX512DQ, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp) #define LAST_NI_AVX512DQ NI_AVX512DQ_Xor // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -1368,7 +1368,7 @@ HARDWARE_INTRINSIC(AVX10v1, TernaryLogic, // AVX10V1_V512 Intrinsics #define FIRST_NI_AVX10v1_V512 NI_AVX10v1_V512_And HARDWARE_INTRINSIC(AVX10v1_V512, And, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX10v1_V512, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1_V512, BroadcastPairScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x2, INS_vbroadcasti32x2, INS_invalid, INS_invalid, INS_vbroadcastf32x2, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX10v1_V512, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x2, INS_vbroadcasti64x2, INS_invalid, INS_vbroadcastf64x2}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX10v1_V512, BroadcastVector256ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x8, INS_vbroadcasti32x8, INS_invalid, INS_invalid, INS_vbroadcastf32x8, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) @@ -1391,7 +1391,7 @@ HARDWARE_INTRINSIC(AVX10v1_V512, PermuteVar64x8, HARDWARE_INTRINSIC(AVX10v1_V512, PermuteVar64x8x2, 64, 3, false, 
{INS_vpermt2b, INS_vpermt2b, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1_V512, Range, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrangeps, INS_vrangepd}, HW_Category_IMM, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1_V512, Reduce, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vreduceps, INS_vreducepd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX10v1_V512, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp) #define LAST_NI_AVX10v1_V512 NI_AVX10v1_V512_Xor // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -1438,7 +1438,7 @@ HARDWARE_INTRINSIC(AES, KeygenAssist, // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI1 Intrinsics #define FIRST_NI_BMI1 NI_BMI1_AndNot -HARDWARE_INTRINSIC(BMI1, AndNot, 0, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI1, AndNot, 0, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_SpecialImport|HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(BMI1, BitFieldExtract, 0, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(BMI1, ExtractLowestSetBit, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(BMI1, 
GetMaskUpToLowestSetBit, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsmsk, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed) @@ -1452,7 +1452,7 @@ HARDWARE_INTRINSIC(BMI1, TrailingZeroCount, // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI1 Intrinsics #define FIRST_NI_BMI1_X64 NI_BMI1_X64_AndNot -HARDWARE_INTRINSIC(BMI1_X64, AndNot, 0, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI1_X64, AndNot, 0, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_SpecialImport|HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(BMI1_X64, BitFieldExtract, 0, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(BMI1_X64, ExtractLowestSetBit, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(BMI1_X64, GetMaskUpToLowestSetBit, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsmsk, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed) @@ -1599,6 +1599,7 @@ HARDWARE_INTRINSIC(EVEX, OrMask, HARDWARE_INTRINSIC(EVEX, ShiftLeftMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(EVEX, ShiftRightMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(EVEX, XorMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(EVEX, XnorMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) #endif // FEATURE_HW_INTRINSIC diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 64adeadcab50a..74566386c2c29 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1394,19 +1394,62 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic 
intrinsic, break; } + case NI_SSE_AndNot: + case NI_SSE2_AndNot: + case NI_AVX_AndNot: + case NI_AVX2_AndNot: + case NI_AVX512F_AndNot: + case NI_AVX512DQ_AndNot: + case NI_AVX10v1_V512_AndNot: + { + assert(sig->numArgs == 2); + + // We don't want to support creating AND_NOT nodes prior to LIR + // as it can break important optimizations. We'll produces this + // in lowering instead so decompose into the individual operations + // on import, taking into account that despite the name, these APIs + // do (~op1 & op2), so we need to account for that + + op2 = impSIMDPopStack(); + op1 = impSIMDPopStack(); + + op1 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op1, simdBaseJitType, simdSize)); + retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize); + break; + } + + case NI_BMI1_AndNot: + case NI_BMI1_X64_AndNot: + { + assert(sig->numArgs == 2); + + // The same general reasoning for the decomposition exists here as + // given above for the SIMD AndNot APIs. + + op2 = impPopStack().val; + op1 = impPopStack().val; + + op1 = gtFoldExpr(gtNewOperNode(GT_NOT, retType, op1)); + retNode = gtNewOperNode(GT_AND, retType, op1, op2); + break; + } + case NI_Vector128_AndNot: case NI_Vector256_AndNot: case NI_Vector512_AndNot: { assert(sig->numArgs == 2); - impSpillSideEffect(true, - verCurrentState.esStackDepth - 2 DEBUGARG("Spilling op1 side effects for HWIntrinsic")); + // We don't want to support creating AND_NOT nodes prior to LIR + // as it can break important optimizations. We'll produces this + // in lowering instead so decompose into the individual operations + // on import op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdBinOpNode(GT_AND_NOT, retType, op1, op2, simdBaseJitType, simdSize); + op2 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize)); + retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize); break; } diff --git a/src/coreclr/jit/importervectorization.cpp b/src/coreclr/jit/importervectorization.cpp index 7447c35de2f29..281b4f6ed4ccd 100644 --- a/src/coreclr/jit/importervectorization.cpp +++ b/src/coreclr/jit/importervectorization.cpp @@ -160,20 +160,8 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD( GenTreeVecCon* toLowerVec1 = gtNewVconNode(simdType, toLowerMask); GenTreeVecCon* toLowerVec2 = gtNewVconNode(simdType, (BYTE*)toLowerMask + byteLen - simdSize); -#if defined(TARGET_XARCH) - if (canUseEvexEncoding()) - { - GenTree* control; - - control = gtNewIconNode(static_cast((0xF0 | 0xCC) ^ 0xAA)); // (A | B)) ^ C - xor1 = gtNewSimdTernaryLogicNode(simdType, vec1, toLowerVec1, cnsVec1, control, baseType, simdSize); - } - else -#endif // TARGET_XARCH - { - vec1 = gtNewSimdBinOpNode(GT_OR, simdType, vec1, toLowerVec1, baseType, simdSize); - xor1 = gtNewSimdBinOpNode(GT_XOR, simdType, vec1, cnsVec1, baseType, simdSize); - } + vec1 = gtNewSimdBinOpNode(GT_OR, simdType, vec1, toLowerVec1, baseType, simdSize); + xor1 = gtNewSimdBinOpNode(GT_XOR, simdType, vec1, cnsVec1, baseType, simdSize); vec2 = gtNewSimdBinOpNode(GT_OR, simdType, vec2, toLowerVec2, baseType, simdSize); } @@ -184,22 +172,10 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD( // ((v1 ^ cns1) | (v2 ^ cns2)) == zero -#if defined(TARGET_XARCH) - if (canUseEvexEncoding()) - { - GenTree* control; - - control = gtNewIconNode(static_cast(0xF0 | (0xCC ^ 0xAA))); // A | (B ^ C) - orr = gtNewSimdTernaryLogicNode(simdType, xor1, vec2, cnsVec2, control, baseType, simdSize); - } - else -#endif // TARGET_XARCH - { - GenTree* 
xor2; + GenTree* xor2; - xor2 = gtNewSimdBinOpNode(GT_XOR, simdType, vec2, cnsVec2, baseType, simdSize); - orr = gtNewSimdBinOpNode(GT_OR, simdType, xor1, xor2, baseType, simdSize); - } + xor2 = gtNewSimdBinOpNode(GT_XOR, simdType, vec2, cnsVec2, baseType, simdSize); + orr = gtNewSimdBinOpNode(GT_OR, simdType, xor1, xor2, baseType, simdSize); // Optimization: use a single load when byteLen equals simdSize. // For code simplicity we always create nodes for two vectors case. diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 3af6eaa4e2fca..b04e97844f1f2 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1233,6 +1233,87 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); + bool isScalar = false; + genTreeOps oper = node->GetOperForHWIntrinsicId(&isScalar); + + switch (oper) + { + case GT_AND: + case GT_OR: + { + // We want to recognize (~op1 & op2) and transform it + // into AdvSimd.AndNot(op2, op1) as well as (op1 & ~op2) + // transforming it into AdvSimd.AndNot(op1, op2) + // + // We want to similarly handle (~op1 | op2) and (op1 | ~op2) + + bool transform = false; + + GenTree* op1 = node->Op(1); + GenTree* op2 = node->Op(2); + + if (op2->OperIsHWIntrinsic()) + { + GenTreeHWIntrinsic* op2Intrin = op2->AsHWIntrinsic(); + + bool op2IsScalar = false; + genTreeOps op2Oper = op2Intrin->GetOperForHWIntrinsicId(&op2IsScalar); + + if (op2Oper == GT_NOT) + { + assert(!op2IsScalar); + transform = true; + + op2 = op2Intrin->Op(1); + BlockRange().Remove(op2Intrin); + } + } + + if (!transform && op1->OperIsHWIntrinsic()) + { + GenTreeHWIntrinsic* opIntrin = op1->AsHWIntrinsic(); + + bool op1IsScalar = false; + genTreeOps op1Oper = opIntrin->GetOperForHWIntrinsicId(&op1IsScalar); + + if (op1Oper == GT_NOT) + { + assert(!op1IsScalar); + transform = true; + + op1 = opIntrin->Op(1); + BlockRange().Remove(opIntrin); + + std::swap(op1, op2); + } + } + + if (transform) + { + if (oper == GT_AND) + { + oper = GT_AND_NOT; + intrinsicId = NI_AdvSimd_BitwiseClear; + } + else + { + assert(oper == GT_OR); + oper = GT_NONE; + intrinsicId = NI_AdvSimd_OrNot; + } + + node->ChangeHWIntrinsicId(intrinsicId, op1, op2); + oper = GT_AND_NOT; + } + break; + } + + default: + { + break; + } + } + switch (intrinsicId) { case NI_Vector64_Create: diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 73fbcb7af66f0..d34595399a56f 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1418,120 +1418,217 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) bool isScalar = false; genTreeOps oper = node->GetOperForHWIntrinsicId(&isScalar); - switch (oper) + if (GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper)) { - case GT_AND: - case GT_OR: - case GT_XOR: - { - if (!comp->canUseEvexEncoding()) - { - break; - } + // These are the control bytes used for TernaryLogic - GenTree* op1 = node->Op(1); - GenTree* op2 = node->Op(2); + const uint8_t A = 0xF0; + const uint8_t B = 0xCC; + const uint8_t C = 0xAA; - LIR::Use use; - if (BlockRange().TryGetUse(node, &use)) + var_types simdType = node->TypeGet(); + CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); + var_types simdBaseType = node->GetSimdBaseType(); + unsigned simdSize = node->GetSimdSize(); + + GenTree* op1 = node->Op(1); + GenTree* op2 = node->Op(2); + GenTree* op3 = nullptr; + + // We want to specially recognize this pattern as GT_NOT + bool isOperNot = (oper == 
GT_XOR) && op2->IsVectorAllBitsSet(); + bool isV512Supported = false; + + LIR::Use use; + if (BlockRange().TryGetUse(node, &use)) + { + GenTree* user = use.User(); + + if (user->OperIsHWIntrinsic()) { - // search for structure like: - /* - /- A - +- B - t1 = binary logical op1 + GenTreeHWIntrinsic* userIntrin = user->AsHWIntrinsic(); + + bool userIsScalar = false; + genTreeOps userOper = userIntrin->GetOperForHWIntrinsicId(&isScalar); - /- C - +- t1 - t2 = binary logical op2 - */ - GenTree* second = use.User(); - if (!second->OperIs(GT_HWINTRINSIC)) + if (GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(userOper)) { - break; - } + if (isOperNot && (userOper == GT_AND)) + { + // We want to specially handle GT_AND_NOT as its available without EVEX + GenTree* nextNode = node->gtNext; - bool nestedIsScalar = false; - genTreeOps nestedOper = second->AsHWIntrinsic()->GetOperForHWIntrinsicId(&isScalar); + BlockRange().Remove(op2); + BlockRange().Remove(node); - if (nestedOper == GT_NONE) - { - // TODO: We should support cases like CNDSEL - break; - } + // Note that despite its name, the xarch instruction does ~op1 & op2, so + // we need to ensure op1 is the value whose ones complement is computed - if (nestedIsScalar) - { - break; - } + op2 = userIntrin->Op(2); - if ((nestedOper != GT_AND) && (nestedOper != GT_OR) && (nestedOper != GT_XOR)) - { - // TODO: We should support other cases like AND_NOT, NOT, and CNDSEL - break; - } + if (op2 == node) + { + op2 = userIntrin->Op(1); + } - GenTree* op3 = second->AsHWIntrinsic()->Op(1) == node ? second->AsHWIntrinsic()->Op(2) - : second->AsHWIntrinsic()->Op(1); - GenTree* control = comp->gtNewIconNode(node->GetTernaryControlByte(second->AsHWIntrinsic())); - CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); - unsigned simdSize = node->GetSimdSize(); - var_types simdType = Compiler::getSIMDTypeForSize(simdSize); - GenTree* ternaryNode = - comp->gtNewSimdTernaryLogicNode(simdType, op1, op2, op3, control, simdBaseJitType, simdSize); - BlockRange().InsertBefore(second, control, ternaryNode); - LIR::Use finalRes; - if (BlockRange().TryGetUse(second, &finalRes)) - { - finalRes.ReplaceWith(ternaryNode); - } - else - { - ternaryNode->SetUnusedValue(); + NamedIntrinsic intrinsic = + GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(comp, GT_AND_NOT, op1, op2, simdBaseType, + simdSize, false); + userIntrin->ResetHWIntrinsicId(intrinsic, comp, op1, op2); + + return nextNode; + } + + if (comp->compIsEvexOpportunisticallySupported(isV512Supported)) + { + // For everything else we want to lower it to a standard TernaryLogic node + GenTree* nextNode = node->gtNext; + + BlockRange().Remove(node); + op3 = userIntrin->Op(2); + + if (op3 == node) + { + op3 = userIntrin->Op(1); + } + + uint8_t controlByte = 0x00; + + if ((userOper == GT_XOR) && op3->IsVectorAllBitsSet()) + { + // We're being used by what is actually GT_NOT, so we + // need to shift parameters down so that A is unused + + std::swap(op2, op3); + std::swap(op1, op2); + + if (isOperNot) + { + // We have what is actually a double not, so just return op2 + // which is the only actual value now that the parameters + // were shifted around + + assert(op1->IsVectorAllBitsSet()); + assert(op3->IsVectorAllBitsSet()); + + LIR::Use superUse; + if (BlockRange().TryGetUse(user, &superUse)) + { + superUse.ReplaceWith(op2); + } + else + { + op2->SetUnusedValue(); + } + + BlockRange().Remove(op3); + BlockRange().Remove(op1); + BlockRange().Remove(user); + + return nextNode; + } + else + { + // We're now doing NOT(OP(B, C)) + 
assert(op1->IsVectorAllBitsSet()); + + controlByte = TernaryLogicInfo::GetTernaryControlByte(oper, B, C); + controlByte = static_cast(~controlByte); + } + } + else if (isOperNot) + { + // A is unused, so we just want OP(NOT(B), C) + + assert(op2->IsVectorAllBitsSet()); + std::swap(op1, op2); + + controlByte = static_cast(~B); + controlByte = TernaryLogicInfo::GetTernaryControlByte(userOper, controlByte, C); + } + else + { + // We have OP2(OP1(A, B), C) + controlByte = TernaryLogicInfo::GetTernaryControlByte(oper, A, B); + controlByte = TernaryLogicInfo::GetTernaryControlByte(userOper, controlByte, C); + } + + NamedIntrinsic ternaryLogicId = NI_AVX512F_TernaryLogic; + + if (simdSize != 64) + { + ternaryLogicId = isV512Supported ? NI_AVX512F_VL_TernaryLogic : NI_AVX10v1_TernaryLogic; + } + + GenTree* op4 = comp->gtNewIconNode(controlByte); + BlockRange().InsertBefore(userIntrin, op4); + + userIntrin->ResetHWIntrinsicId(ternaryLogicId, comp, op1, op2, op3, op4); + return nextNode; + } } - GenTree* next = node->gtNext; - BlockRange().Remove(node); - BlockRange().Remove(second); - return next; } - break; } - default: + if (isOperNot && comp->compIsEvexOpportunisticallySupported(isV512Supported)) { - break; - } - } + // Lowering this to TernaryLogic(zero, zero, op1, ~C) is smaller + // and faster than emitting the pcmpeqd; pxor sequence. - if ((oper == GT_XOR) && node->Op(2)->IsVectorAllBitsSet()) - { - bool isV512Supported = false; - if ((genTypeSize(node->GetSimdBaseType()) >= 4) && comp->compIsEvexOpportunisticallySupported(isV512Supported)) - { - var_types simdType = node->TypeGet(); - unsigned simdSize = node->GetSimdSize(); + BlockRange().Remove(op2); - GenTree* op1 = node->Op(1); - BlockRange().Remove(node->Op(2)); + if (op1->OperIsHWIntrinsic()) + { + GenTreeHWIntrinsic* opIntrin = op1->AsHWIntrinsic(); + + if (HWIntrinsicInfo::IsTernaryLogic(opIntrin->GetHWIntrinsicId())) + { + GenTree* opControl = opIntrin->Op(4); + + if (opControl->IsCnsIntOrI()) + { + // When the input is already a ternary logic node, we want to invert it rather + // than introduce a new ternary logic node. + + GenTree* nextNode = node->gtNext; + + GenTreeIntConCommon* opControlCns = opControl->AsIntConCommon(); + opControlCns->SetIconValue(static_cast(~opControlCns->IconValue())); + + if (BlockRange().TryGetUse(node, &use)) + { + use.ReplaceWith(op1); + } + else + { + op1->SetUnusedValue(); + } + + BlockRange().Remove(node); + return nextNode; + } + } + } - // We can't use the mask, but we can emit a ternary logic node NamedIntrinsic ternaryLogicId = NI_AVX512F_TernaryLogic; if (simdSize != 64) { - ternaryLogicId = !isV512Supported ? NI_AVX10v1_TernaryLogic : NI_AVX512F_VL_TernaryLogic; + ternaryLogicId = isV512Supported ? 
NI_AVX512F_VL_TernaryLogic : NI_AVX10v1_TernaryLogic; } - GenTree* op2 = comp->gtNewZeroConNode(simdType); + op3 = op1; + + op2 = comp->gtNewZeroConNode(simdType); BlockRange().InsertBefore(node, op2); - GenTree* op3 = comp->gtNewZeroConNode(simdType); - BlockRange().InsertBefore(node, op3); + op1 = comp->gtNewZeroConNode(simdType); + BlockRange().InsertBefore(node, op1); - GenTree* control = comp->gtNewIconNode(static_cast(~0xAA)); // ~C + GenTree* control = comp->gtNewIconNode(static_cast(~C)); BlockRange().InsertBefore(node, control); - node->ResetHWIntrinsicId(ternaryLogicId, comp, op3, op2, op1, control); + node->ResetHWIntrinsicId(ternaryLogicId, comp, op1, op2, op3, control); return LowerNode(node); } } @@ -1741,6 +1838,79 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_EVEX_AndMask: + { + // We want to recognize (~op1 & op2) and transform it + // into Evex.AndNotMask(op1, op2) as well as (op1 & ~op2) + // transforming it into Evex.AndNotMask(op2, op1), which + // takes into account that the XARCH APIs operate more like + // NotAnd + + bool transform = false; + + GenTree* op1 = node->Op(1); + GenTree* op2 = node->Op(2); + + if (op1->OperIsHWIntrinsic(NI_EVEX_NotMask)) + { + GenTreeHWIntrinsic* opIntrin = op1->AsHWIntrinsic(); + unsigned simdBaseTypeSize = genTypeSize(node->GetSimdBaseType()); + + if (genTypeSize(opIntrin->GetSimdBaseType()) == simdBaseTypeSize) + { + transform = true; + + op1 = opIntrin->Op(1); + BlockRange().Remove(opIntrin); + } + } + + if (!transform && op2->OperIsHWIntrinsic(NI_EVEX_NotMask)) + { + GenTreeHWIntrinsic* opIntrin = op2->AsHWIntrinsic(); + unsigned simdBaseTypeSize = genTypeSize(node->GetSimdBaseType()); + + if (genTypeSize(opIntrin->GetSimdBaseType()) == simdBaseTypeSize) + { + transform = true; + + op1 = opIntrin->Op(1); + BlockRange().Remove(opIntrin); + + std::swap(op1, op2); + } + } + + if (transform) + { + intrinsicId = NI_EVEX_AndNotMask; + node->ChangeHWIntrinsicId(intrinsicId, op1, op2); + } + break; + } + + case NI_EVEX_NotMask: + { + // We want to recognize ~(op1 ^ op2) and transform it + // into Evex.XnorMask(op1, op2) + + GenTree* op1 = node->Op(1); + + if (op1->OperIsHWIntrinsic(NI_EVEX_XorMask)) + { + GenTreeHWIntrinsic* opIntrin = op1->AsHWIntrinsic(); + unsigned simdBaseTypeSize = genTypeSize(node->GetSimdBaseType()); + + if (genTypeSize(opIntrin->GetSimdBaseType()) == simdBaseTypeSize) + { + intrinsicId = NI_EVEX_XnorMask; + node->ResetHWIntrinsicId(intrinsicId, comp, opIntrin->Op(1), opIntrin->Op(2)); + BlockRange().Remove(opIntrin); + } + } + break; + } + case NI_Vector128_ToScalar: case NI_Vector256_ToScalar: case NI_Vector512_ToScalar: @@ -3210,6 +3380,12 @@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node) { assert(comp->canUseEvexEncodingDebugOnly()); + // These are the control bytes used for TernaryLogic + + const uint8_t A = 0xF0; + const uint8_t B = 0xCC; + const uint8_t C = 0xAA; + var_types simdType = node->gtType; CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); var_types simdBaseType = node->GetSimdBaseType(); @@ -3226,66 +3402,66 @@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node) if (op4->IsCnsIntOrI()) { - uint8_t control = static_cast(op4->AsIntConCommon()->IconValue()); + uint8_t control = static_cast(op4->AsIntConCommon()->IconValue()); + const TernaryLogicInfo& info = TernaryLogicInfo::lookup(control); + TernaryLogicUseFlags useFlags = info.GetAllUseFlags(); switch (control) { - case 0xAC: // A ? 
C : B; (C & A) | (B & ~A) - case 0xB8: // B ? C : A; (C & B) | (A & ~B) - case 0xD8: // C ? B : A; (B & C) | (A & ~C) - case 0xCA: // A ? B : C; (B & A) | (C & ~A) - case 0xE2: // B ? A : C; (A & B) | (C & ~B) - case 0xE4: // C ? A : B; (A & C) | (B & ~C) + case static_cast((C & A) | (B & ~A)): // A ? C : B + case static_cast((C & B) | (A & ~B)): // B ? C : A + case static_cast((B & C) | (A & ~C)): // C ? B : A + case static_cast((B & A) | (C & ~A)): // A ? B : C + case static_cast((A & B) | (C & ~B)): // B ? A : C + case static_cast((A & C) | (B & ~C)): // C ? A : B { // For the operations that work as a conditional select, we want // to try and optimize it to use BlendVariableMask when the condition // is already a TYP_MASK - const TernaryLogicInfo& ternLogInfo = TernaryLogicInfo::lookup(control); - - assert(ternLogInfo.oper1 == TernaryLogicOperKind::Select); - assert(ternLogInfo.oper2 == TernaryLogicOperKind::Select); - assert(ternLogInfo.oper3 == TernaryLogicOperKind::Cond); + assert(info.oper1 == TernaryLogicOperKind::Select); + assert(info.oper2 == TernaryLogicOperKind::Select); + assert(info.oper3 == TernaryLogicOperKind::Cond); GenTree* condition = nullptr; GenTree* selectTrue = nullptr; GenTree* selectFalse = nullptr; - if (ternLogInfo.oper1Use == TernaryLogicUseFlags::A) + if (info.oper1Use == TernaryLogicUseFlags::A) { selectTrue = op1; - if (ternLogInfo.oper2Use == TernaryLogicUseFlags::B) + if (info.oper2Use == TernaryLogicUseFlags::B) { - assert(ternLogInfo.oper3Use == TernaryLogicUseFlags::C); + assert(info.oper3Use == TernaryLogicUseFlags::C); selectFalse = op2; condition = op3; } else { - assert(ternLogInfo.oper2Use == TernaryLogicUseFlags::C); - assert(ternLogInfo.oper3Use == TernaryLogicUseFlags::B); + assert(info.oper2Use == TernaryLogicUseFlags::C); + assert(info.oper3Use == TernaryLogicUseFlags::B); selectFalse = op3; condition = op2; } } - else if (ternLogInfo.oper1Use == TernaryLogicUseFlags::B) + else if (info.oper1Use == TernaryLogicUseFlags::B) { selectTrue = op2; - if (ternLogInfo.oper2Use == TernaryLogicUseFlags::A) + if (info.oper2Use == TernaryLogicUseFlags::A) { - assert(ternLogInfo.oper3Use == TernaryLogicUseFlags::C); + assert(info.oper3Use == TernaryLogicUseFlags::C); selectFalse = op1; condition = op3; } else { - assert(ternLogInfo.oper2Use == TernaryLogicUseFlags::C); - assert(ternLogInfo.oper3Use == TernaryLogicUseFlags::A); + assert(info.oper2Use == TernaryLogicUseFlags::C); + assert(info.oper3Use == TernaryLogicUseFlags::A); selectFalse = op3; condition = op1; @@ -3293,21 +3469,21 @@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node) } else { - assert(ternLogInfo.oper1Use == TernaryLogicUseFlags::C); + assert(info.oper1Use == TernaryLogicUseFlags::C); selectTrue = op3; - if (ternLogInfo.oper2Use == TernaryLogicUseFlags::A) + if (info.oper2Use == TernaryLogicUseFlags::A) { - assert(ternLogInfo.oper3Use == TernaryLogicUseFlags::B); + assert(info.oper3Use == TernaryLogicUseFlags::B); selectFalse = op1; condition = op2; } else { - assert(ternLogInfo.oper2Use == TernaryLogicUseFlags::B); - assert(ternLogInfo.oper3Use == TernaryLogicUseFlags::A); + assert(info.oper2Use == TernaryLogicUseFlags::B); + assert(info.oper3Use == TernaryLogicUseFlags::A); selectFalse = op2; condition = op1; @@ -3465,11 +3641,173 @@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node) default: { + switch (useFlags) + { + case TernaryLogicUseFlags::A: + { + // Swap the operands here to make the containment checks in codegen significantly 
simpler + std::swap(node->Op(1), node->Op(3)); + + // Make sure we also fixup the control byte + control = TernaryLogicInfo::GetTernaryControlByte(info, C, B, A); + op4->AsIntCon()->SetIconValue(control); + + useFlags = TernaryLogicUseFlags::C; + break; + } + + case TernaryLogicUseFlags::B: + { + // Swap the operands here to make the containment checks in codegen significantly simpler + std::swap(node->Op(2), node->Op(3)); + + // Make sure we also fixup the control byte + control = TernaryLogicInfo::GetTernaryControlByte(info, A, C, B); + op4->AsIntCon()->SetIconValue(control); + + useFlags = TernaryLogicUseFlags::C; + break; + } + + case TernaryLogicUseFlags::AB: + { + // Swap the operands here to make the containment checks in codegen significantly simpler + std::swap(node->Op(2), node->Op(3)); + std::swap(node->Op(1), node->Op(2)); + + // Make sure we also fixup the control byte + control = TernaryLogicInfo::GetTernaryControlByte(info, C, A, B); + op4->AsIntCon()->SetIconValue(control); + + useFlags = TernaryLogicUseFlags::BC; + break; + } + + case TernaryLogicUseFlags::AC: + { + // Swap the operands here to make the containment checks in codegen significantly simpler + std::swap(node->Op(1), node->Op(2)); + + // Make sure we also fixup the control byte + control = TernaryLogicInfo::GetTernaryControlByte(info, B, A, C); + op4->AsIntCon()->SetIconValue(control); + + useFlags = TernaryLogicUseFlags::BC; + break; + } + + default: + { + break; + } + } + + GenTree* replacementNode = nullptr; + + switch (useFlags) + { + case TernaryLogicUseFlags::None: + { + // Encountering none should be very rare and so we'll handle + // it, but we won't try to optimize it by finding an existing + // constant to reuse or similar, as that's more expensive + + op1->SetUnusedValue(); + op2->SetUnusedValue(); + op3->SetUnusedValue(); + + if (control == 0x00) + { + replacementNode = comp->gtNewZeroConNode(simdType); + } + else + { + assert(control == 0xFF); + replacementNode = comp->gtNewAllBitsSetConNode(simdType); + } + + BlockRange().InsertBefore(node, replacementNode); + break; + } + + case TernaryLogicUseFlags::C: + { + // Encountering `select(c)` instead of `not(c)` should likewise + // be rare, but we'll handle it in case the combined operations + // are just right to cause it to appear + + if (control == C) + { + op1->SetUnusedValue(); + op2->SetUnusedValue(); + + replacementNode = op3; + break; + } + + // For not, we do want to check if we already have reusable constants as + // this can occur for the normal lowering pattern around `xor(c, AllBitsSet)` + + if (!op1->IsCnsVec()) + { + op1->SetUnusedValue(); + op1 = comp->gtNewZeroConNode(simdType); + + BlockRange().InsertBefore(node, op1); + node->Op(1) = op1; + } + + if (!op2->IsCnsVec()) + { + op2->SetUnusedValue(); + op2 = comp->gtNewZeroConNode(simdType); + + BlockRange().InsertBefore(node, op2); + node->Op(2) = op2; + } + break; + } + + case TernaryLogicUseFlags::BC: + { + if (!op1->IsCnsVec()) + { + op1->SetUnusedValue(); + op1 = comp->gtNewZeroConNode(simdType); + + BlockRange().InsertBefore(node, op1); + node->Op(1) = op1; + } + break; + } + + default: + { + assert(useFlags == TernaryLogicUseFlags::ABC); + break; + } + } + + if (replacementNode != nullptr) + { + LIR::Use use; + if (BlockRange().TryGetUse(node, &use)) + { + use.ReplaceWith(replacementNode); + } + else + { + replacementNode->SetUnusedValue(); + } + } break; } } } + // TODO-XARCH-AVX512: We should look for nested TernaryLogic and BitwiseOper + // nodes so that we can fully take 
advantage of the instruction where possible + ContainCheckHWIntrinsic(node); return node->gtNext; } @@ -7246,6 +7584,11 @@ GenTree* Lowering::PreferredRegOptionalOperand(GenTree* op1, GenTree* op2) // f) If neither of them are local vars (i.e. tree temps), prefer to // mark op1 as reg optional for the same reason as mentioned in (d) above. + if (op1 == nullptr) + { + return op2; + } + assert(!op1->IsRegOptional()); assert(!op2->IsRegOptional()); @@ -9290,7 +9633,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre assert(childBaseType == TYP_DOUBLE); } - if (comp->canUseEvexEncoding() && parentNode->OperIsEmbBroadcastCompatible()) + if (parentNode->OperIsEmbBroadcastCompatible() && comp->canUseEvexEncoding()) { GenTree* broadcastOperand = hwintrinsic->Op(1); @@ -9328,7 +9671,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre return false; } - return parentNode->OperIsEmbBroadcastCompatible(); + return parentNode->OperIsEmbBroadcastCompatible() && comp->canUseEvexEncoding(); } default: @@ -9891,59 +10234,60 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) bool supportsOp1RegOptional = false; bool supportsOp2RegOptional = false; + GenTree* containedOperand = nullptr; + GenTree* regOptionalOperand = nullptr; + bool swapOperands = false; + if (IsContainableHWIntrinsicOp(node, op2, &supportsOp2RegOptional)) { - if (op2->IsCnsVec() && comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F) && - node->OperIsEmbBroadcastCompatible()) - { - TryFoldCnsVecForEmbeddedBroadcast(node, op2->AsVecCon()); - } - else - { - MakeSrcContained(node, op2); - } + containedOperand = op2; } else if ((isCommutative || (intrinsicId == NI_BMI2_MultiplyNoFlags) || (intrinsicId == NI_BMI2_X64_MultiplyNoFlags)) && IsContainableHWIntrinsicOp(node, op1, &supportsOp1RegOptional)) { - if (op1->IsCnsVec() && comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F) && - node->OperIsEmbBroadcastCompatible()) + containedOperand = op1; + swapOperands = true; + } + else + { + if (supportsOp1RegOptional) { - TryFoldCnsVecForEmbeddedBroadcast(node, op1->AsVecCon()); + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op1); } - else + + if (supportsOp2RegOptional) { - MakeSrcContained(node, op1); + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op2); } - // Swap the operands here to make the containment checks in codegen significantly simpler - std::swap(node->Op(1), node->Op(2)); + if (regOptionalOperand == op1) + { + swapOperands = true; + } } - else if (supportsOp1RegOptional) + + if (containedOperand != nullptr) { - if (supportsOp2RegOptional) + if (containedOperand->IsCnsVec() && node->OperIsEmbBroadcastCompatible() && + comp->canUseEvexEncoding()) { - GenTree* regOptionalOperand = PreferredRegOptionalOperand(op1, op2); - MakeSrcRegOptional(node, regOptionalOperand); - - if (regOptionalOperand == op1) - { - // Swap the operands here to make the containment checks in codegen simpler - std::swap(node->Op(1), node->Op(2)); - } + TryFoldCnsVecForEmbeddedBroadcast(node, containedOperand->AsVecCon()); } else { - MakeSrcRegOptional(node, op1); - - // Swap the operands here to make the containment checks in codegen simpler - std::swap(node->Op(1), node->Op(2)); + MakeSrcContained(node, containedOperand); } } - else if (supportsOp2RegOptional) + else if (regOptionalOperand != nullptr) { - MakeSrcRegOptional(node, op2); + MakeSrcRegOptional(node, regOptionalOperand); + } + + if (swapOperands) + { + // 
Swap the operands here to make the containment checks in codegen significantly simpler + std::swap(node->Op(1), node->Op(2)); } break; } @@ -10268,6 +10612,9 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) bool supportsOp2RegOptional = false; bool supportsOp3RegOptional = false; + GenTree* containedOperand = nullptr; + GenTree* regOptionalOperand = nullptr; + LIR::Use use; GenTree* user = nullptr; @@ -10284,61 +10631,58 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if ((resultOpNum != 3) && IsContainableHWIntrinsicOp(node, op3, &supportsOp3RegOptional)) { // result = (op1 * op2) + [op3] - MakeSrcContained(node, op3); + containedOperand = op3; } else if ((resultOpNum != 2) && IsContainableHWIntrinsicOp(node, op2, &supportsOp2RegOptional)) { // result = (op1 * [op2]) + op3 - MakeSrcContained(node, op2); + containedOperand = op2; } else if ((resultOpNum != 1) && !HWIntrinsicInfo::CopiesUpperBits(intrinsicId) && IsContainableHWIntrinsicOp(node, op1, &supportsOp1RegOptional)) { // result = ([op1] * op2) + op3 - MakeSrcContained(node, op1); + containedOperand = op1; } - else if (supportsOp1RegOptional) + else { - if (supportsOp2RegOptional) + if (supportsOp1RegOptional) { - GenTree* regOptionalOperand = PreferredRegOptionalOperand(op1, op2); - - if (supportsOp3RegOptional) - { - regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op3); - } - - MakeSrcRegOptional(node, regOptionalOperand); + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op1); } - else if (supportsOp3RegOptional) + + if (supportsOp2RegOptional) { - GenTree* regOptionalOperand = PreferredRegOptionalOperand(op1, op3); - MakeSrcRegOptional(node, regOptionalOperand); + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op2); } - else + + if (supportsOp3RegOptional) { - MakeSrcRegOptional(node, op1); + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op3); } } - else if (supportsOp2RegOptional) + + if (containedOperand != nullptr) { - if (supportsOp3RegOptional) + if (containedOperand->IsCnsVec() && node->OperIsEmbBroadcastCompatible() && + comp->canUseEvexEncoding()) { - GenTree* regOptionalOperand = PreferredRegOptionalOperand(op2, op3); - MakeSrcRegOptional(node, regOptionalOperand); + TryFoldCnsVecForEmbeddedBroadcast(node, containedOperand->AsVecCon()); } else { - MakeSrcRegOptional(node, op2); + MakeSrcContained(node, containedOperand); } } - else if (supportsOp3RegOptional) + else if (regOptionalOperand != nullptr) { - MakeSrcRegOptional(node, op3); + MakeSrcRegOptional(node, regOptionalOperand); } } else if (HWIntrinsicInfo::IsPermuteVar2x(intrinsicId)) { + assert(comp->canUseEvexEncodingDebugOnly()); + // PermuteVar2x is similarly special in that op1 and op3 // are commutative and op1 or op2 can be the RMW operand. 
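Aside, not part of the patch: the containment rewrites above all share the same accumulate-and-pick shape, seeding a regOptionalOperand with nullptr and folding each eligible operand through PreferredRegOptionalOperand, which is why that helper now tolerates a null first argument. A minimal standalone sketch of that shape, using a stand-in Node type and a made-up weight heuristic rather than the JIT's real preference rules:

#include <cstdio>

// Stand-in node type; the real PreferredRegOptionalOperand heuristic looks at
// lclVar uses, last-use information, and so on. Here "weight" is a made-up
// stand-in so the accumulate-and-pick shape is visible on its own.
struct Node
{
    const char* name;
    int         weight;
};

static Node* Preferred(Node* current, Node* candidate)
{
    if (current == nullptr)
    {
        // Mirrors the nullptr guard added to PreferredRegOptionalOperand:
        // the first eligible candidate wins by default.
        return candidate;
    }
    return (candidate->weight > current->weight) ? candidate : current;
}

int main()
{
    Node op1{"op1", 2};
    Node op2{"op2", 5};
    Node op3{"op3", 3};

    bool supportsOp1RegOptional = true;
    bool supportsOp2RegOptional = true;
    bool supportsOp3RegOptional = false;

    Node* regOptionalOperand = nullptr;

    if (supportsOp1RegOptional)
    {
        regOptionalOperand = Preferred(regOptionalOperand, &op1);
    }
    if (supportsOp2RegOptional)
    {
        regOptionalOperand = Preferred(regOptionalOperand, &op2);
    }
    if (supportsOp3RegOptional)
    {
        regOptionalOperand = Preferred(regOptionalOperand, &op3);
    }

    printf("reg optional operand: %s\n", regOptionalOperand->name); // prints op2
    return 0;
}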
// @@ -10354,8 +10698,11 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) bool supportsOp1RegOptional = false; bool supportsOp3RegOptional = false; - bool swapOperands = false; - bool isOp2Cns = op2->IsCnsVec(); + + GenTree* containedOperand = nullptr; + GenTree* regOptionalOperand = nullptr; + bool swapOperands = false; + bool isOp2Cns = op2->IsCnsVec(); LIR::Use use; GenTree* user = nullptr; @@ -10373,44 +10720,51 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if (((resultOpNum != 3) || !isOp2Cns) && IsContainableHWIntrinsicOp(node, op3, &supportsOp3RegOptional)) { - MakeSrcContained(node, op3); + containedOperand = op3; } else if ((resultOpNum != 2) && isOp2Cns && IsContainableHWIntrinsicOp(node, op1, &supportsOp1RegOptional)) { - MakeSrcContained(node, op1); - - // Swap the operands here to make the containment checks in codegen significantly simpler - swapOperands = true; + containedOperand = op1; + swapOperands = true; } - else if (supportsOp1RegOptional) + else { + if (supportsOp1RegOptional) + { + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op1); + } + if (supportsOp3RegOptional) { - GenTree* regOptionalOperand = PreferredRegOptionalOperand(op1, op3); - MakeSrcRegOptional(node, regOptionalOperand); + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op3); + } - if (regOptionalOperand == op1) - { - // Swap the operands here to make the containment checks in codegen simpler - swapOperands = true; - } + if (regOptionalOperand == op1) + { + swapOperands = true; + } + } + + if (containedOperand != nullptr) + { + if (containedOperand->IsCnsVec() && node->OperIsEmbBroadcastCompatible()) + { + TryFoldCnsVecForEmbeddedBroadcast(node, containedOperand->AsVecCon()); } else { - MakeSrcRegOptional(node, op1); - - // Swap the operands here to make the containment checks in codegen simpler - swapOperands = true; + MakeSrcContained(node, containedOperand); } } - else if (supportsOp3RegOptional) + else if (regOptionalOperand != nullptr) { - MakeSrcRegOptional(node, op3); + MakeSrcRegOptional(node, regOptionalOperand); } if (swapOperands) { + // Swap the operands here to make the containment checks in codegen significantly simpler assert(op2->IsCnsVec()); std::swap(node->Op(1), node->Op(3)); @@ -10640,41 +10994,51 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) bool supportsOp1RegOptional = false; bool supportsOp2RegOptional = false; + GenTree* containedOperand = nullptr; + GenTree* regOptionalOperand = nullptr; + bool swapOperands = false; + if (IsContainableHWIntrinsicOp(node, op2, &supportsOp2RegOptional)) { - MakeSrcContained(node, op2); + containedOperand = op2; } else if (IsContainableHWIntrinsicOp(node, op1, &supportsOp1RegOptional)) { - MakeSrcContained(node, op1); - - // Swap the operands here to make the containment checks in codegen simpler - std::swap(node->Op(1), node->Op(2)); + containedOperand = op1; + swapOperands = true; } - else if (supportsOp1RegOptional) + else { - if (supportsOp2RegOptional) + if (supportsOp1RegOptional) { - GenTree* regOptionalOperand = PreferredRegOptionalOperand(op1, op2); - MakeSrcRegOptional(node, regOptionalOperand); - - if (regOptionalOperand == op1) - { - // Swap the operands here to make the containment checks in codegen simpler - std::swap(node->Op(1), node->Op(2)); - } + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op1); } - else + + if (supportsOp2RegOptional) { - MakeSrcRegOptional(node, op1); + regOptionalOperand = 
PreferredRegOptionalOperand(regOptionalOperand, op2); + } - // Swap the operands here to make the containment checks in codegen simpler - std::swap(node->Op(1), node->Op(2)); + if (regOptionalOperand == op1) + { + swapOperands = true; } } - else if (supportsOp2RegOptional) + + if (containedOperand != nullptr) { - MakeSrcRegOptional(node, op2); + MakeSrcContained(node, containedOperand); + } + else if (regOptionalOperand != nullptr) + { + MakeSrcRegOptional(node, regOptionalOperand); + } + + if (swapOperands) + { + // Swap the operands here to make the containment checks in codegen significantly + // simpler + std::swap(node->Op(1), node->Op(2)); } break; } @@ -10932,45 +11296,241 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) case NI_AVX512F_VL_TernaryLogic: case NI_AVX10v1_TernaryLogic: { + assert(comp->canUseEvexEncodingDebugOnly()); + + // These are the control bytes used for TernaryLogic + + const uint8_t A = 0xF0; + const uint8_t B = 0xCC; + const uint8_t C = 0xAA; + if (!isContainedImm) { // Don't contain if we're generating a jmp table fallback break; } - if (IsContainableHWIntrinsicOp(node, op3, &supportsRegOptional)) - { - MakeSrcContained(node, op3); - } - else if (supportsRegOptional) - { - MakeSrcRegOptional(node, op3); - } - uint8_t control = static_cast(op4->AsIntCon()->gtIconVal); - const TernaryLogicInfo& info = TernaryLogicInfo::lookup(control); - TernaryLogicUseFlags useFlags = info.GetAllUseFlags(); + const TernaryLogicInfo* info = &TernaryLogicInfo::lookup(control); + TernaryLogicUseFlags useFlags = info->GetAllUseFlags(); - if (useFlags != TernaryLogicUseFlags::ABC) - { - assert(!node->isRMWHWIntrinsic(comp)); + bool supportsOp1RegOptional = false; + bool supportsOp2RegOptional = false; + bool supportsOp3RegOptional = false; - // op1, and possibly op2, are never selected - // by the table so we can contain and ignore - // any register allocated to it resulting in - // better non-RMW based codegen. 
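Aside, not part of the patch: the A/B/C constants used throughout this lowering code are the vpternlog truth-table columns, so evaluating any bitwise expression over them bitwise yields the imm8 control byte for that expression directly. A small standalone sketch under that assumption (plain C++, no JIT types):

#include <cstdint>
#include <cstdio>

int main()
{
    // Same column constants as the lowering code: bit i of each constant holds
    // that operand's value for input combination i, so a bitwise expression
    // over A/B/C produces the matching ternary-logic control byte.
    const uint8_t A = 0xF0;
    const uint8_t B = 0xCC;
    const uint8_t C = 0xAA;

    uint8_t andNot = static_cast<uint8_t>(~B & C);    // ~B & C            -> 0x22
    uint8_t notC   = static_cast<uint8_t>(~C);        // ~C (A, B unused)  -> 0x55
    uint8_t andBC  = static_cast<uint8_t>(B & C);     // B & C             -> 0x88
    uint8_t orABC  = static_cast<uint8_t>(A | B | C); // A | B | C         -> 0xFE

    printf("~B & C = 0x%02X, ~C = 0x%02X, B & C = 0x%02X, A | B | C = 0x%02X\n",
           andNot, notC, andBC, orABC);
    return 0;
}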
+ GenTree* containedOperand = nullptr; + GenTree* regOptionalOperand = nullptr; + TernaryLogicUseFlags swapOperands = TernaryLogicUseFlags::None; - MakeSrcContained(node, op1); + switch (useFlags) + { + case TernaryLogicUseFlags::None: + { + break; + } - if (useFlags == TernaryLogicUseFlags::C) + case TernaryLogicUseFlags::C: { + // We're only using op3, so that's the one to try and contain + + assert(op1->IsCnsVec()); + MakeSrcContained(node, op1); + + assert(op2->IsCnsVec()); MakeSrcContained(node, op2); + + if (IsContainableHWIntrinsicOp(node, op3, &supportsOp3RegOptional)) + { + containedOperand = op3; + } + else if (supportsOp3RegOptional) + { + regOptionalOperand = op3; + } + break; + } + + case TernaryLogicUseFlags::BC: + { + // We're only using op2 and op3, so find the right one to contain + // using the standard commutative rules, fixing up the control byte + // as needed to ensure the operation remains the same + + assert(op1->IsCnsVec()); + MakeSrcContained(node, op1); + + if (IsContainableHWIntrinsicOp(node, op3, &supportsOp3RegOptional)) + { + containedOperand = op3; + } + else if (IsContainableHWIntrinsicOp(node, op2, &supportsOp2RegOptional)) + { + containedOperand = op2; + swapOperands = TernaryLogicUseFlags::BC; + } + else + { + if (supportsOp2RegOptional) + { + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op2); + } + + if (supportsOp3RegOptional) + { + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op3); + } + + if (regOptionalOperand == op2) + { + swapOperands = TernaryLogicUseFlags::BC; + } + } + break; + } + + case TernaryLogicUseFlags::ABC: + { + // TernaryLogic is special in that any operand can be contained + // and any other operand can be the RMW operand. + // + // This comes about from having a control byte that indicates + // the operation to be performed per operand. + + LIR::Use use; + GenTree* user = nullptr; + + if (BlockRange().TryGetUse(node, &use)) + { + user = use.User(); + } + unsigned resultOpNum = node->GetResultOpNumForRmwIntrinsic(user, op1, op2, op3); + + // Prioritize Containable op. Check if any one of the op is containable first. + // Set op regOptional only if none of them is containable. 
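Aside, not part of the patch: swapping which node operand feeds A, B, or C is a permutation of the truth-table columns, which is why every std::swap in this handler is paired with a GetTernaryControlByte recomputation. A standalone sketch with a hypothetical SwapAandC helper (not the JIT's API) showing the remapping:

#include <cstdint>
#include <cstdio>

// Hypothetical helper, not the JIT's TernaryLogicInfo: given a ternary-logic
// truth table 'control', return the table for the same function with the
// A and C operands swapped. Bit i of the control encodes f(a, b, c) where
// a = (i >> 2) & 1, b = (i >> 1) & 1, c = i & 1.
static uint8_t SwapAandC(uint8_t control)
{
    uint8_t result = 0;
    for (int i = 0; i < 8; i++)
    {
        int a = (i >> 2) & 1;
        int b = (i >> 1) & 1;
        int c = i & 1;
        int j = (c << 2) | (b << 1) | a; // same inputs, with a and c exchanged

        if ((control >> i) & 1)
        {
            result |= static_cast<uint8_t>(1 << j);
        }
    }
    return result;
}

int main()
{
    // (A & B) | C is 0xEA; swapping A and C gives (C & B) | A, which is 0xF8.
    printf("0x%02X -> 0x%02X\n", 0xEA, SwapAandC(0xEA));
    return 0;
}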
+ + if (resultOpNum == 2) + { + // Swap the operands here to make the containment checks in codegen + // significantly simpler + std::swap(node->Op(1), node->Op(2)); + std::swap(op1, op2); + + // Make sure we also fixup the control byte + control = TernaryLogicInfo::GetTernaryControlByte(*info, B, A, C); + op4->AsIntCon()->SetIconValue(control); + + // Result is now in op1, but also get the updated info + resultOpNum = 1; + info = &TernaryLogicInfo::lookup(control); + } + else if (resultOpNum == 3) + { + // Swap the operands here to make the containment checks in codegen + // significantly simpler + std::swap(node->Op(1), node->Op(3)); + std::swap(op1, op3); + + // Make sure we also fixup the control byte + control = TernaryLogicInfo::GetTernaryControlByte(*info, C, B, A); + op4->AsIntCon()->SetIconValue(control); + + // Result is now in op1, but also get the updated info + resultOpNum = 1; + info = &TernaryLogicInfo::lookup(control); + } + + // Prefer to make op3 contained as it doesn't require reordering operands + if (IsContainableHWIntrinsicOp(node, op3, &supportsOp3RegOptional)) + { + containedOperand = op3; + } + else if (IsContainableHWIntrinsicOp(node, op2, &supportsOp2RegOptional)) + { + containedOperand = op2; + swapOperands = TernaryLogicUseFlags::BC; + } + else if ((resultOpNum != 1) && + IsContainableHWIntrinsicOp(node, op1, &supportsOp1RegOptional)) + { + containedOperand = op1; + swapOperands = TernaryLogicUseFlags::AC; + } + else + { + if (supportsOp1RegOptional) + { + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op1); + } + + if (supportsOp2RegOptional) + { + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op2); + } + + if (supportsOp3RegOptional) + { + regOptionalOperand = PreferredRegOptionalOperand(regOptionalOperand, op3); + } + + if (regOptionalOperand == op1) + { + swapOperands = TernaryLogicUseFlags::AC; + } + else if (regOptionalOperand == op2) + { + swapOperands = TernaryLogicUseFlags::BC; + } + } + break; + } + + default: + { + // Lowering should have normalized to one of the above + unreached(); + } + } + + if (containedOperand != nullptr) + { + if (containedOperand->IsCnsVec() && node->OperIsEmbBroadcastCompatible()) + { + TryFoldCnsVecForEmbeddedBroadcast(node, containedOperand->AsVecCon()); } else { - assert(useFlags == TernaryLogicUseFlags::BC); + MakeSrcContained(node, containedOperand); } } + else if (regOptionalOperand != nullptr) + { + MakeSrcRegOptional(node, regOptionalOperand); + } + + if (swapOperands == TernaryLogicUseFlags::AC) + { + // Swap the operands here to make the containment checks in codegen + // significantly simpler + std::swap(node->Op(1), node->Op(3)); + + // Make sure we also fixup the control byte + control = TernaryLogicInfo::GetTernaryControlByte(*info, C, B, A); + op4->AsIntCon()->SetIconValue(control); + } + else if (swapOperands == TernaryLogicUseFlags::BC) + { + // Swap the operands here to make the containment checks in codegen + // significantly simpler + std::swap(node->Op(2), node->Op(3)); + + // Make sure we also fixup the control byte + control = TernaryLogicInfo::GetTernaryControlByte(*info, A, C, B); + op4->AsIntCon()->SetIconValue(control); + } + else + { + assert(swapOperands == TernaryLogicUseFlags::None); + } break; } diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 7df5b471fb7a9..d44ace570f921 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -9921,6 +9921,9 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* 
node) genTreeOps actualOper = node->GetOperForHWIntrinsicId(&isScalar); genTreeOps oper = actualOper; + // We shouldn't find AND_NOT nodes since it should only be produced in lowering + assert(oper != GT_AND_NOT); + if (GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper)) { GenTree* op1 = node->Op(1); @@ -9994,12 +9997,6 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node) break; } - case GT_AND_NOT: - { - maskIntrinsicId = NI_EVEX_AndNotMask; - break; - } - case GT_NOT: { maskIntrinsicId = NI_EVEX_NotMask; @@ -10079,91 +10076,6 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node) switch (oper) { - // Transforms: - // 1. (~v1 & v2) to VectorXxx.AndNot(v2, v1) - // 2. (v1 & ~v2) to VectorXxx.AndNot(v1, v2) - case GT_AND: - { - GenTree* op1 = node->Op(1); - GenTree* op2 = node->Op(2); - GenTree* lhs = nullptr; - GenTree* rhs = nullptr; - - if (op1->OperIsHWIntrinsic()) - { - // Try handle: ~op1 & op2 - GenTreeHWIntrinsic* hw = op1->AsHWIntrinsic(); - genTreeOps hwOper = hw->GetOperForHWIntrinsicId(&isScalar); - - if (isScalar) - { - return node; - } - -#if defined(TARGET_ARM64) - if (hwOper == GT_NOT) - { - lhs = op2; - rhs = hw->Op(1); - } -#elif defined(TARGET_XARCH) - if ((hwOper == GT_XOR) && hw->Op(2)->IsVectorAllBitsSet()) - { - lhs = op2; - rhs = hw->Op(1); - } -#endif // !TARGET_ARM64 && !TARGET_XARCH - } - - if ((lhs == nullptr) && op2->OperIsHWIntrinsic()) - { - // Try handle: op1 & ~op2 - GenTreeHWIntrinsic* hw = op2->AsHWIntrinsic(); - genTreeOps hwOper = hw->GetOperForHWIntrinsicId(&isScalar); - - if (isScalar) - { - return node; - } - -#if defined(TARGET_ARM64) - if (hwOper == GT_NOT) - { - lhs = op1; - rhs = hw->Op(1); - } -#elif defined(TARGET_XARCH) - if ((hwOper == GT_XOR) && hw->Op(2)->IsVectorAllBitsSet()) - { - lhs = op1; - rhs = hw->Op(1); - } -#endif // !TARGET_ARM64 && !TARGET_XARCH - } - - if (lhs == nullptr) - { - break; - } - assert(rhs != nullptr); - - // Filter out side effecting cases for several reasons: - // 1. gtNewSimdBinOpNode may swap operand order. - // 2. The code above will swap operand order. - // 3. The code above does not handle GTF_REVERSE_OPS. - if (((lhs->gtFlags | rhs->gtFlags) & GTF_ALL_EFFECT) != 0) - { - break; - } - - GenTree* andnNode = gtNewSimdBinOpNode(GT_AND_NOT, retType, lhs, rhs, simdBaseJitType, simdSize); - - DEBUG_DESTROY_NODE(node); - INDEBUG(andnNode->gtDebugFlags |= GTF_DEBUG_NODE_MORPHED); - - return andnNode; - } - #if defined(TARGET_ARM64) // Transforms: // 1. 
-(-v1) to v1 diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 09ea63f5e0ae4..1b795678e7b00 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -7919,13 +7919,11 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(GenTreeHWIntrinsic* tree, if (oper != GT_NONE) { + // We shouldn't find AND_NOT nodes since it should only be produced in lowering + assert(oper != GT_AND_NOT); + #if defined(TARGET_XARCH) - if (oper == GT_AND_NOT) - { - // xarch does: ~arg0VN & arg1VN - std::swap(arg0VN, arg1VN); - } - else if ((oper == GT_LSH) || (oper == GT_RSH) || (oper == GT_RSZ)) + if ((oper == GT_LSH) || (oper == GT_RSH) || (oper == GT_RSZ)) { if (TypeOfVN(arg1VN) == TYP_SIMD16) { @@ -8047,6 +8045,9 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(GenTreeHWIntrinsic* tree, bool isScalar = false; genTreeOps oper = tree->GetOperForHWIntrinsicId(&isScalar); + // We shouldn't find AND_NOT nodes since it should only be produced in lowering + assert(oper != GT_AND_NOT); + if (isScalar) { // We don't support folding scalars today @@ -8108,37 +8109,6 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(GenTreeHWIntrinsic* tree, break; } - case GT_AND_NOT: - { -#if defined(TARGET_XARCH) - std::swap(arg0VN, arg1VN); -#endif // TARGET_XARCH - - // Handle `x & ~0 == x` and `0 & ~x == 0` - ValueNum zeroVN = VNZeroForType(type); - - if (cnsVN == zeroVN) - { - if (cnsVN == arg0VN) - { - return zeroVN; - } - return argVN; - } - - // Handle `x & ~AllBitsSet == 0` - ValueNum allBitsVN = VNAllBitsForType(type); - - if (cnsVN == allBitsVN) - { - if (cnsVN == arg1VN) - { - return zeroVN; - } - } - break; - } - case GT_DIV: { if (varTypeIsFloating(baseType)) @@ -8397,6 +8367,9 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(GenTreeHWIntrinsic* tree, bool isScalar = false; genTreeOps oper = tree->GetOperForHWIntrinsicId(&isScalar); + // We shouldn't find AND_NOT nodes since it should only be produced in lowering + assert(oper != GT_AND_NOT); + if (isScalar) { // We don't support folding scalars today @@ -8411,12 +8384,6 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(GenTreeHWIntrinsic* tree, return arg0VN; } - case GT_AND_NOT: - { - // Handle `x & ~x == 0` - return VNZeroForType(type); - } - case GT_OR: { // Handle `x | x == x` @@ -8575,12 +8542,12 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunTernary(GenTreeHWIntrinsic* tree, switch (ni) { - case NI_Vector128_ConditionalSelect: #if defined(TARGET_XARCH) + case NI_Vector128_ConditionalSelect: case NI_Vector256_ConditionalSelect: case NI_Vector512_ConditionalSelect: #elif defined(TARGET_ARM64) - case NI_Vector64_ConditionalSelect: + case NI_AdvSimd_BitwiseSelect: case NI_Sve_ConditionalSelect: #endif {
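Aside, not part of the patch: with AndNot now imported as (x & ~y), the dedicated GT_AND_NOT folds deleted from valuenum.cpp are recovered by the ordinary NOT and AND folds composing. A scalar sanity check of the identities the removed cases handled:

#include <cassert>
#include <cstdint>

int main()
{
    const uint64_t x          = 0x0123456789ABCDEFull;
    const uint64_t zero       = 0;
    const uint64_t allBitsSet = ~0ull;

    assert((x & ~zero) == x);       // x & ~0          == x
    assert((zero & ~x) == 0);       // 0 & ~x          == 0
    assert((x & ~allBitsSet) == 0); // x & ~AllBitsSet == 0
    assert((x & ~x) == 0);          // x & ~x          == 0

    return 0;
}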