Skip to content

Commit

Permalink
Arm64/SVE: JitStress/JitStressRegs fixes (#102543)
Browse files Browse the repository at this point in the history
* handle case for FMA where falseReg == embMaskOp1Reg

* workaround because predicateRegister/vectorRegister are same

* When intrinsic is wrapped in ConditionalSelect, make sure to check its LOW_PREDICATE flag

* Mark AddAcross with HW_Flag_LowMaskedOperation

* double check if ConditionalSelect's op2 is hwintrinsic

* Mark Max with HW_Flag_LowMaskedOperation

* Mark MaxAcross with HW_Flag_LowMaskedOperation

* Mark MinNumber/MaxNumber/MinNumberAcross/MaxNumberAcross with HW_Flag_LowMaskedOperation

* Mark Min/MinAcross with HW_Flag_LowMaskedOperation

* fix the workaround for predicate vs. vector register

* fix the predicate mask preference

* Introduce INS_SCALABLE_OPTS_PREDICATE_MERGE_MOV

* jit format

* revert change to csproj

* remove assert that can not happen for FMA

if falseReg == embMaskOp2Reg, we simply generate:

```
            sel     z16.s, p7, z9.s, z10.s
            mla     z16.s, p7/m, z10.s, z11.s
```

Here `z10` holds `falseReg` and `embMaskOp2Reg`.

* revert a condition added for workaround of predicate == vector register

* remove the extra comment
  • Loading branch information
kunalspathak authored May 22, 2024
1 parent 437a9a3 commit 4f4ca87
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 24 deletions.
2 changes: 1 addition & 1 deletion src/coreclr/jit/codegenarm64test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6254,7 +6254,7 @@ void CodeGen::genArm64EmitterUnitTestsSve()

// IF_SVE_CW_4A
theEmitter->emitIns_R_R_R(INS_sve_mov, EA_SCALABLE, REG_V0, REG_P0, REG_V30, INS_OPTS_SCALABLE_H,
INS_SCALABLE_OPTS_PREDICATE_MERGE); // MOV <Zd>.<T>, <Pv>/M, <Zn>.<T>
INS_SCALABLE_OPTS_PREDICATE_MERGE_MOV); // MOV <Zd>.<T>, <Pv>/M, <Zn>.<T>
theEmitter->emitIns_R_R_R_R(INS_sve_sel, EA_SCALABLE, REG_V29, REG_P15, REG_V28, REG_V4, INS_OPTS_SCALABLE_D,
INS_SCALABLE_OPTS_UNPREDICATED); // SEL <Zd>.<T>, <Pv>, <Zn>.<T>, <Zm>.<T>
theEmitter->emitIns_R_R_R_R(INS_sve_sel, EA_SCALABLE, REG_V5, REG_P13, REG_V27, REG_V5, INS_OPTS_SCALABLE_S,
Expand Down
6 changes: 4 additions & 2 deletions src/coreclr/jit/emitarm64sve.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3785,7 +3785,9 @@ void emitter::emitInsSve_R_R_R(instruction ins,
// MOV is an alias for CPY, and is always the preferred disassembly.
ins = INS_sve_mov;
}
else if (sopt == INS_SCALABLE_OPTS_PREDICATE_MERGE)
// TODO-SVE: Change the below check to INS_SCALABLE_OPTS_PREDICATE_MERGE
// once predicate registers are present.
else if (sopt == INS_SCALABLE_OPTS_PREDICATE_MERGE_MOV)
{
assert(isVectorRegister(reg1));
assert(isPredicateRegister(reg2));
Expand Down Expand Up @@ -5891,7 +5893,7 @@ void emitter::emitInsSve_R_R_R_R(instruction ins,
{
// mov is a preferred alias for sel
return emitInsSve_R_R_R(INS_sve_mov, attr, reg1, reg2, reg3, opt,
INS_SCALABLE_OPTS_PREDICATE_MERGE);
INS_SCALABLE_OPTS_PREDICATE_MERGE_MOV);
}

assert(insOptsScalableStandard(opt));
Expand Down
26 changes: 17 additions & 9 deletions src/coreclr/jit/hwintrinsiccodegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -576,8 +576,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
{
assert(instrIsRMW);
assert(HWIntrinsicInfo::IsFmaIntrinsic(intrinEmbMask.id));
assert(falseReg != embMaskOp1Reg);
assert(falseReg != embMaskOp2Reg);
assert(falseReg != embMaskOp3Reg);

// For FMA, the operation we are trying to perform is:
Expand Down Expand Up @@ -704,17 +702,27 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
// fmla target, P/m, embMaskOp2Reg, embMaskOp3Reg
//
// Note that, we just check if the targetReg/falseReg or targetReg/embMaskOp1Reg
// coincides or not. Other combination like falseReg/embMaskOp*Reg cannot happen
// because we marked embMaskOp*Reg as delayFree.
// coincides or not.

if (targetReg != falseReg)
{
// If falseReg value is not present in targetReg yet, move the inactive lanes
// into the targetReg using `sel`. Since this is RMW, the active lanes should
// have the value from embMaskOp1Reg
if (falseReg == embMaskOp1Reg)
{
// If falseReg value and embMaskOp1Reg value are same, then just mov the value
// to the target.

GetEmitter()->emitIns_R_R_R_R(INS_sve_sel, emitSize, targetReg, maskReg, embMaskOp1Reg,
falseReg, opt, INS_SCALABLE_OPTS_UNPREDICATED);
GetEmitter()->emitIns_Mov(INS_mov, emitTypeSize(node), targetReg, embMaskOp1Reg,
/* canSkip */ true);
}
else
{
// If falseReg value is not present in targetReg yet, move the inactive lanes
// into the targetReg using `sel`. Since this is RMW, the active lanes should
// have the value from embMaskOp1Reg

GetEmitter()->emitIns_R_R_R_R(INS_sve_sel, emitSize, targetReg, maskReg, embMaskOp1Reg,
falseReg, opt, INS_SCALABLE_OPTS_UNPREDICATED);
}
}
else if (targetReg != embMaskOp1Reg)
{
Expand Down
20 changes: 10 additions & 10 deletions src/coreclr/jit/hwintrinsiclistarm64sve.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@
// SVE Intrinsics

// Sve
HARDWARE_INTRINSIC(Sve, Abs, -1, -1, false, {INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_fabs, INS_sve_fabs}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation)
HARDWARE_INTRINSIC(Sve, Abs, -1, -1, false, {INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_fabs, INS_sve_fabs}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, AbsoluteDifference, -1, -1, false, {INS_sve_sabd, INS_sve_uabd, INS_sve_sabd, INS_sve_uabd, INS_sve_sabd, INS_sve_uabd, INS_sve_sabd, INS_sve_uabd, INS_sve_fabd, INS_sve_fabd}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, Add, -1, -1, false, {INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_fadd, INS_sve_fadd}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, AddAcross, -1, 1, true, {INS_sve_saddv, INS_sve_uaddv, INS_sve_saddv, INS_sve_uaddv, INS_sve_saddv, INS_sve_uaddv, INS_sve_uaddv, INS_sve_uaddv, INS_sve_faddv, INS_sve_faddv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation)
HARDWARE_INTRINSIC(Sve, AddAcross, -1, 1, true, {INS_sve_saddv, INS_sve_uaddv, INS_sve_saddv, INS_sve_uaddv, INS_sve_saddv, INS_sve_uaddv, INS_sve_uaddv, INS_sve_uaddv, INS_sve_faddv, INS_sve_faddv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, AddSaturate, -1, 2, true, {INS_sve_sqadd, INS_sve_uqadd, INS_sve_sqadd, INS_sve_uqadd, INS_sve_sqadd, INS_sve_uqadd, INS_sve_sqadd, INS_sve_uqadd, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, And, -1, -1, false, {INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, AndAcross, -1, -1, false, {INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
Expand Down Expand Up @@ -93,14 +93,14 @@ HARDWARE_INTRINSIC(Sve, LoadVectorUInt16ZeroExtendToUInt32,
HARDWARE_INTRINSIC(Sve, LoadVectorUInt16ZeroExtendToUInt64, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1h, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, LoadVectorUInt32ZeroExtendToInt64, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, LoadVectorUInt32ZeroExtendToUInt64, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, Max, -1, -1, false, {INS_sve_smax, INS_sve_umax, INS_sve_smax, INS_sve_umax, INS_sve_smax, INS_sve_umax, INS_sve_smax, INS_sve_umax, INS_sve_fmax, INS_sve_fmax}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics)
HARDWARE_INTRINSIC(Sve, MaxAcross, -1, -1, false, {INS_sve_smaxv, INS_sve_umaxv, INS_sve_smaxv, INS_sve_umaxv, INS_sve_smaxv, INS_sve_umaxv, INS_sve_smaxv, INS_sve_umaxv, INS_sve_fmaxv, INS_sve_fmaxv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation)
HARDWARE_INTRINSIC(Sve, MaxNumber, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmaxnm, INS_sve_fmaxnm}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics)
HARDWARE_INTRINSIC(Sve, MaxNumberAcross, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmaxnmv, INS_sve_fmaxnmv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation)
HARDWARE_INTRINSIC(Sve, Min, -1, -1, false, {INS_sve_smin, INS_sve_umin, INS_sve_smin, INS_sve_umin, INS_sve_smin, INS_sve_umin, INS_sve_smin, INS_sve_umin, INS_sve_fmin, INS_sve_fmin}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics)
HARDWARE_INTRINSIC(Sve, MinAcross, -1, -1, false, {INS_sve_sminv, INS_sve_uminv, INS_sve_sminv, INS_sve_uminv, INS_sve_sminv, INS_sve_uminv, INS_sve_sminv, INS_sve_uminv, INS_sve_fminv, INS_sve_fminv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation)
HARDWARE_INTRINSIC(Sve, MinNumber, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fminnm, INS_sve_fminnm}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics)
HARDWARE_INTRINSIC(Sve, MinNumberAcross, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fminnmv, INS_sve_fminnmv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation)
HARDWARE_INTRINSIC(Sve, Max, -1, -1, false, {INS_sve_smax, INS_sve_umax, INS_sve_smax, INS_sve_umax, INS_sve_smax, INS_sve_umax, INS_sve_smax, INS_sve_umax, INS_sve_fmax, INS_sve_fmax}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, MaxAcross, -1, -1, false, {INS_sve_smaxv, INS_sve_umaxv, INS_sve_smaxv, INS_sve_umaxv, INS_sve_smaxv, INS_sve_umaxv, INS_sve_smaxv, INS_sve_umaxv, INS_sve_fmaxv, INS_sve_fmaxv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, MaxNumber, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmaxnm, INS_sve_fmaxnm}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, MaxNumberAcross, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmaxnmv, INS_sve_fmaxnmv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, Min, -1, -1, false, {INS_sve_smin, INS_sve_umin, INS_sve_smin, INS_sve_umin, INS_sve_smin, INS_sve_umin, INS_sve_smin, INS_sve_umin, INS_sve_fmin, INS_sve_fmin}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, MinAcross, -1, -1, false, {INS_sve_sminv, INS_sve_uminv, INS_sve_sminv, INS_sve_uminv, INS_sve_sminv, INS_sve_uminv, INS_sve_sminv, INS_sve_uminv, INS_sve_fminv, INS_sve_fminv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, MinNumber, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fminnm, INS_sve_fminnm}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, MinNumberAcross, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fminnmv, INS_sve_fminnmv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, Multiply, -1, 2, true, {INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_fmul, INS_sve_fmul}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, MultiplyAdd, -1, -1, false, {INS_sve_mla, INS_sve_mla, INS_sve_mla, INS_sve_mla, INS_sve_mla, INS_sve_mla, INS_sve_mla, INS_sve_mla, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_FmaIntrinsic|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(Sve, MultiplyBySelectedScalar, -1, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmul, INS_sve_fmul}, HW_Category_SIMDByIndexedElement, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_LowVectorOperation)
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/instr.h
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,7 @@ enum insScalableOpts : unsigned
INS_SCALABLE_OPTS_TO_PREDICATE, // Variants moving to a predicate from a vector (e.g. pmov)
INS_SCALABLE_OPTS_TO_VECTOR, // Variants moving to a vector from a predicate (e.g. pmov)
INS_SCALABLE_OPTS_BROADCAST, // Used to distinguish mov from cpy, where mov is an alias for both
INS_SCALABLE_OPTS_PREDICATE_MERGE_MOV, // Use to distinguish mov (predicated) from other variants
};

// Maps directly to the pattern used in SVE instructions such as cntb.
Expand Down
24 changes: 23 additions & 1 deletion src/coreclr/jit/lsraarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1561,7 +1561,29 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
}
else if (HWIntrinsicInfo::IsMaskedOperation(intrin.id))
{
regMaskTP predMask = HWIntrinsicInfo::IsLowMaskedOperation(intrin.id) ? RBM_LOWMASK : RBM_ALLMASK;
regMaskTP predMask = RBM_ALLMASK;
if (intrin.id == NI_Sve_ConditionalSelect)
{
// If this is conditional select, make sure to check the embedded
// operation to determine the predicate mask.
assert(intrinsicTree->GetOperandCount() == 3);
assert(!HWIntrinsicInfo::IsLowMaskedOperation(intrin.id));

if (intrin.op2->OperIs(GT_HWINTRINSIC))
{
GenTreeHWIntrinsic* embOp2Node = intrin.op2->AsHWIntrinsic();
const HWIntrinsic intrinEmb(embOp2Node);
if (HWIntrinsicInfo::IsLowMaskedOperation(intrinEmb.id))
{
predMask = RBM_LOWMASK;
}
}
}
else if (HWIntrinsicInfo::IsLowMaskedOperation(intrin.id))
{
predMask = RBM_LOWMASK;
}

srcCount += BuildOperandUses(intrin.op1, predMask);
}
else if (intrinsicTree->OperIsMemoryLoadOrStore())
Expand Down
Loading

0 comments on commit 4f4ca87

Please sign in to comment.