Commit

Decompose some bitwise operations in HIR to allow more overall optimizations to kick in (#104517)

* Decompose some bitwise operations in HIR to allow more overall optimizations to kick in

* Ensure that we actually remove the underlying op

* Ensure the AND_NOT decomposition is still folded during import for minopts

* Ensure we propagate AllBitsSet into simd GT_XOR on xarch

* Ensure that we prefer AndNot over TernaryLogic

* Cleanup the TernaryLogic lowering code

* Ensure that TernaryLogic picks the best operand for containment

* Ensure we swap the operands that are being checked for containment

* Ensure that TernaryLogic is simplified where possible

* Apply formatting patch
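
In scalar terms, the decomposition keeps `AND_NOT(x, y)` (that is, `x & ~y`) as
two simpler HIR nodes so generic folds can act on each piece. A minimal sketch
of why that helps (illustrative C++, not JIT code):

    #include <cstdint>

    // Fused form: opaque to folds that only know about NOT and AND.
    uint32_t and_not_fused(uint32_t x, uint32_t y) { return x & ~y; }

    // Decomposed form: ~y is its own step, so a constant y folds away,
    // and x & AllBitsSet then folds to x.
    uint32_t and_not_decomposed(uint32_t x, uint32_t y)
    {
        uint32_t notY = ~y; // separate NOT node; constant-folds when y is constant
        return x & notY;    // plain AND node; existing AND folds now apply
    }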
tannergooding authored Jul 13, 2024
1 parent 5336e18 commit 6d3cb53
Showing 14 changed files with 1,469 additions and 566 deletions.
205 changes: 63 additions & 142 deletions src/coreclr/jit/gentree.cpp
@@ -20506,10 +20506,17 @@ GenTree* Compiler::gtNewSimdAbsNode(var_types type, GenTree* op1, CorInfoType si

GenTree* bitMask;

-bitMask = gtNewDconNode(-0.0, simdBaseType);
-bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, simdBaseJitType, simdSize);
-
-return gtNewSimdBinOpNode(GT_AND_NOT, type, op1, bitMask, simdBaseJitType, simdSize);
+if (simdBaseType == TYP_FLOAT)
+{
+    bitMask = gtNewIconNode(0x7FFFFFFF);
+    bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, CORINFO_TYPE_INT, simdSize);
+}
+else
+{
+    bitMask = gtNewLconNode(0x7FFFFFFFFFFFFFFF);
+    bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, CORINFO_TYPE_LONG, simdSize);
+}
+return gtNewSimdBinOpNode(GT_AND, type, op1, bitMask, simdBaseJitType, simdSize);
}

NamedIntrinsic intrinsic = NI_Illegal;
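
The new masks work because clearing the IEEE-754 sign bit is exactly the
absolute value. A standalone sketch of what the broadcast 0x7FFFFFFF and
0x7FFFFFFFFFFFFFFF constants encode (illustrative C++, not JIT code):

    #include <cstdint>
    #include <cstring>

    float abs_via_mask(float f)
    {
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits)); // view the float as raw bits
        bits &= 0x7FFFFFFFu;                  // clear the sign bit
        std::memcpy(&f, &bits, sizeof(f));
        return f;                             // |f|, matching the vector GT_AND above
    }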
@@ -20750,12 +20757,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(
}
}
}

-if (op == GT_AND_NOT)
-{
-    // GT_AND_NOT expects `op1 & ~op2`, but xarch does `~op1 & op2`
-    needsReverseOps = true;
-}
break;
}
#endif // TARGET_XARCH
@@ -20786,11 +20787,34 @@ GenTree* Compiler::gtNewSimdBinOpNode(

if (intrinsic != NI_Illegal)
{
+if (op == GT_AND_NOT)
+{
+    assert(fgNodeThreading == NodeThreading::LIR);
+
+#if defined(TARGET_XARCH)
+    // GT_AND_NOT expects `op1 & ~op2`, but xarch does `~op1 & op2`
+    // We specially handle this here since we're only producing a
+    // native intrinsic node in LIR
+
+    std::swap(op1, op2);
+#endif // TARGET_XARCH
+}
return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize);
}

switch (op)
{
+        case GT_AND_NOT:
+        {
+            // Prior to LIR, we want to explicitly decompose this operation so that downstream phases can
+            // appropriately optimize around the individual operations being performed, particularly ~op2,
+            // and produce overall better codegen.
+            assert(fgNodeThreading != NodeThreading::LIR);
+
+            op2 = gtNewSimdUnOpNode(GT_NOT, type, op2, simdBaseJitType, simdSize);
+            return gtNewSimdBinOpNode(GT_AND, type, op1, op2, simdBaseJitType, simdSize);
+        }
+
#if defined(TARGET_XARCH)
case GT_RSZ:
{
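
The operand swap in the LIR path above is sound because hardware ANDN reverses
which operand is complemented: `GT_AND_NOT` means `op1 & ~op2`, while xarch
computes `~src1 & src2`. A quick scalar check (illustrative C++, not JIT code):

    #include <cstdint>

    uint32_t gt_and_not(uint32_t op1, uint32_t op2) { return op1 & ~op2; }
    uint32_t andn_xarch(uint32_t src1, uint32_t src2) { return ~src1 & src2; }
    // For all a and b: gt_and_not(a, b) == andn_xarch(b, a), hence the swap.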
@@ -20955,9 +20979,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(
vecCon1->gtSimdVal.u64[i] = 0x00FF00FF00FF00FF;
}

-// Validate we can't use AVX512F_VL_TernaryLogic here
-assert(!canUseEvexEncodingDebugOnly());
-
// Vector256<short> maskedProduct = Avx2.And(widenedProduct, vecCon1).AsInt16()
GenTree* maskedProduct = gtNewSimdBinOpNode(GT_AND, widenedType, widenedProduct, vecCon1,
widenedSimdBaseJitType, widenedSimdSize);
@@ -21922,9 +21943,6 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
v = gtNewSimdHWIntrinsicNode(type, v, gtNewIconNode(SHUFFLE_ZZXX, TYP_INT), NI_SSE2_Shuffle,
CORINFO_TYPE_INT, simdSize);

-// Validate we can't use AVX512F_VL_TernaryLogic here
-assert(!canUseEvexEncodingDebugOnly());
-
op2 = gtNewSimdBinOpNode(GT_AND, type, u, v, simdBaseJitType, simdSize);
return gtNewSimdBinOpNode(GT_OR, type, op1, op2, simdBaseJitType, simdSize);
}
@@ -24315,9 +24333,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(

GenTree* vecCon2 = gtCloneExpr(vecCon1);

-// Validate we can't use AVX512F_VL_TernaryLogic here
-assert(!canUseEvexEncodingDebugOnly());
-
tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_UBYTE,
@@ -24356,9 +24371,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(

GenTree* vecCon2 = gtCloneExpr(vecCon1);

-// Validate we can't use AVX512F_VL_TernaryLogic here
-assert(!canUseEvexEncodingDebugOnly());
-
tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_USHORT,
@@ -24460,9 +24472,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(

GenTree* vecCon2 = gtCloneExpr(vecCon1);

-// Validate we can't use AVX512F_VL_TernaryLogic here
-assert(!canUseEvexEncodingDebugOnly());
-
tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);

@@ -24499,9 +24508,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(

GenTree* vecCon2 = gtCloneExpr(vecCon1);

-// Validate we can't use AVX512F_VL_TernaryLogic here
-assert(!canUseEvexEncodingDebugOnly());
-
tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);

@@ -28120,6 +28126,14 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp,
assert(!isScalar);
assert(op2->TypeIs(simdType));

+if (comp->fgNodeThreading != NodeThreading::LIR)
+{
+    // We don't want to support creating AND_NOT nodes prior to LIR,
+    // as it can break important optimizations. We'll produce this
+    // in lowering instead.
+    break;
+}
+
#if defined(TARGET_XARCH)
if (simdSize == 64)
{
@@ -29187,6 +29201,21 @@ bool GenTreeHWIntrinsic::ShouldConstantProp(GenTree* operand, GenTreeVecCon* vec
return IsUserCall() && (operand == Op(2));
}

+#if defined(TARGET_XARCH)
+        case NI_SSE_Xor:
+        case NI_SSE2_Xor:
+        case NI_AVX_Xor:
+        case NI_AVX2_Xor:
+        case NI_AVX512F_Xor:
+        case NI_AVX512DQ_Xor:
+        case NI_AVX10v1_V512_Xor:
+        {
+            // We recognize this as GT_NOT which can enable other optimizations
+            assert(GetOperandCount() == 2);
+            return vecCon->IsVectorAllBitsSet();
+        }
+#endif // TARGET_XARCH
+
default:
{
break;
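
The new carve-out above keeps an all-bits-set constant on Xor from being
propagated away, because that pattern is really a complement. A scalar
illustration (not JIT code):

    #include <cstdint>

    // v ^ AllBitsSet == ~v, so Xor(v, AllBitsSet) is recognized as GT_NOT and
    // can feed NOT-based simplifications such as AND_NOT formation in lowering.
    uint32_t not_via_xor(uint32_t v) { return v ^ 0xFFFFFFFFu; }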
@@ -29936,7 +29965,8 @@ bool GenTreeLclVar::IsNeverNegative(Compiler* comp) const
unsigned GenTreeHWIntrinsic::GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree* op1, GenTree* op2, GenTree* op3)
{
#if defined(TARGET_XARCH)
-assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId) || HWIntrinsicInfo::IsPermuteVar2x(gtHWIntrinsicId));
+assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId) || HWIntrinsicInfo::IsPermuteVar2x(gtHWIntrinsicId) ||
+       HWIntrinsicInfo::IsTernaryLogic(gtHWIntrinsicId));
#elif defined(TARGET_ARM64)
assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId));
#endif
@@ -29980,85 +30010,6 @@ unsigned GenTreeHWIntrinsic::GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree

return 0;
}

-//------------------------------------------------------------------------
-// GetTernaryControlByte: calculate the value of the control byte for ternary node
-// with given logic nodes on the input.
-//
-// Return value: the value of the ternary control byte.
-uint8_t GenTreeHWIntrinsic::GetTernaryControlByte(GenTreeHWIntrinsic* second) const
-{
-    // we assume we have a structure like:
-    /*
-           /- A
-       +- B
-    t1 = binary logical op1
-
-       /- C
-       +- t1
-    t2 = binary logical op2
-    */
-
-    // To calculate the control byte value:
-    // The way the constants work is we have three keys:
-    // * A: 0xF0
-    // * B: 0xCC
-    // * C: 0xAA
-    //
-    // To compute the correct control byte, you simply perform the corresponding operation on these keys. So, if you
-    // wanted to do (A & B) ^ C, you would compute (0xF0 & 0xCC) ^ 0xAA or 0x6A.
-    assert(second->Op(1) == this || second->Op(2) == this);
-    const uint8_t A = 0xF0;
-    const uint8_t B = 0xCC;
-    const uint8_t C = 0xAA;
-
-    bool isScalar = false;
-
-    genTreeOps firstOper = GetOperForHWIntrinsicId(&isScalar);
-    assert(!isScalar);
-
-    genTreeOps secondOper = second->GetOperForHWIntrinsicId(&isScalar);
-    assert(!isScalar);
-
-    uint8_t AB  = 0;
-    uint8_t ABC = 0;
-
-    if (firstOper == GT_AND)
-    {
-        AB = A & B;
-    }
-    else if (firstOper == GT_OR)
-    {
-        AB = A | B;
-    }
-    else if (firstOper == GT_XOR)
-    {
-        AB = A ^ B;
-    }
-    else
-    {
-        unreached();
-    }
-
-    if (secondOper == GT_AND)
-    {
-        ABC = AB & C;
-    }
-    else if (secondOper == GT_OR)
-    {
-        ABC = AB | C;
-    }
-    else if (secondOper == GT_XOR)
-    {
-        ABC = AB ^ C;
-    }
-    else
-    {
-        unreached();
-    }
-
-    return ABC;
-}
#endif // TARGET_XARCH && FEATURE_HW_INTRINSICS

unsigned GenTreeLclFld::GetSize() const
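
For reference, the control-byte scheme that the removed helper implemented can
be reproduced by evaluating the desired boolean expression over the three key
masks. A standalone sketch (illustrative C++, not JIT code):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // vpternlog keys: the truth-table mask contributed by each operand
        const uint8_t A = 0xF0, B = 0xCC, C = 0xAA;

        const uint8_t andXor = (A & B) ^ C; // 0x6A, as in the removed comment
        const uint8_t orAnd  = (A | B) & C; // 0xA8, another worked example

        std::printf("0x%02X 0x%02X\n", (unsigned)andXor, (unsigned)orAnd);
        return 0;
    }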
@@ -30454,13 +30405,8 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
bool isScalar = false;
genTreeOps oper = tree->GetOperForHWIntrinsicId(&isScalar);

-#if defined(TARGET_XARCH)
-if (oper == GT_AND_NOT)
-{
-    // xarch does: ~op1 & op2, we need op1 & ~op2
-    std::swap(op1, op2);
-}
-#endif // TARGET_XARCH
+// We shouldn't find AND_NOT nodes here, since they should only be produced in lowering
+assert(oper != GT_AND_NOT);

GenTree* cnsNode = nullptr;
GenTree* otherNode = nullptr;
@@ -30973,31 +30919,6 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
break;
}

-        case GT_AND_NOT:
-        {
-            // Handle `x & ~0 == x` and `0 & ~x == 0`
-            if (cnsNode->IsVectorZero())
-            {
-                if (cnsNode == op1)
-                {
-                    resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
-                    break;
-                }
-                else
-                {
-                    resultNode = otherNode;
-                }
-                break;
-            }
-
-            // Handle `x & ~AllBitsSet == 0`
-            if (cnsNode->IsVectorAllBitsSet() && (cnsNode == op2))
-            {
-                resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
-            }
-            break;
-        }
-
case GT_DIV:
{
if (varTypeIsFloating(simdBaseType))
@@ -31388,12 +31309,12 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
{
switch (ni)
{
-            case NI_Vector128_ConditionalSelect:
#if defined(TARGET_XARCH)
+            case NI_Vector128_ConditionalSelect:
case NI_Vector256_ConditionalSelect:
case NI_Vector512_ConditionalSelect:
#elif defined(TARGET_ARM64)
case NI_Vector64_ConditionalSelect:
case NI_AdvSimd_BitwiseSelect:
case NI_Sve_ConditionalSelect:
#endif
{
1 change: 0 additions & 1 deletion src/coreclr/jit/gentree.h
@@ -6527,7 +6527,6 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic
bool OperRequiresGlobRefFlag() const;

unsigned GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree* op1, GenTree* op2, GenTree* op3);
-uint8_t GetTernaryControlByte(GenTreeHWIntrinsic* second) const;

ClassLayout* GetLayout(Compiler* compiler) const;

