Arm64/Sve: Implement SVE Math *Multiply* APIs #102007

Merged
merged 30 commits · May 11, 2024
Changes from 6 commits · 30 commits
97373ca
Add *Fused* APIs
kunalspathak May 6, 2024
4e14098
fix an assert in morph
kunalspathak May 7, 2024
3fb9dea
Map APIs to instructions
kunalspathak May 7, 2024
600391a
Add test cases
kunalspathak May 8, 2024
67e4d4d
handle fused* instructions
kunalspathak May 8, 2024
54899b2
jit format
kunalspathak May 8, 2024
e4a53ae
Added MultiplyAdd/MultiplySubtract
kunalspathak May 8, 2024
bfad7b7
Add mapping of API to instruction
kunalspathak May 8, 2024
100f289
Add test cases
kunalspathak May 8, 2024
8ac1840
Handle mov Z, Z instruction
kunalspathak May 9, 2024
9eb195e
Reuse GetResultOpNumForRmwIntrinsic() for arm64
kunalspathak May 9, 2024
c182d0d
Reuse HW_Flag_FmaIntrinsic for arm64
kunalspathak May 9, 2024
62ea159
Mark FMA APIs as HW_Flag_FmaIntrinsic
kunalspathak May 9, 2024
28a49cb
Handle FMA in LSRA and codegen
kunalspathak May 9, 2024
722dd55
Remove the SpecialCodeGen flag from selectedScalar
kunalspathak May 9, 2024
229f78f
address some more scenarios
kunalspathak May 10, 2024
a21439f
jit format
kunalspathak May 10, 2024
6a01ca4
Add MultiplyBySelectedScalar
kunalspathak May 10, 2024
318cbf3
Map the API to the instruction
kunalspathak May 10, 2024
e3fc830
fix a bug where *Indexed API used with ConditionalSelect were failing
kunalspathak May 10, 2024
1ca5539
unpredicated movprfx should not send opt
kunalspathak May 10, 2024
eb41e1d
Add the missing flags for Subtract/Multiply
kunalspathak May 10, 2024
7874f25
Added tests for MultiplyBySelectedScalar
kunalspathak May 10, 2024
f756afb
fixes to test cases
kunalspathak May 10, 2024
2904934
fix the parameter for selectedScalar test
kunalspathak May 10, 2024
53d29a0
Merge remote-tracking branch 'origin/main' into sve_math6
kunalspathak May 10, 2024
98ac0ce
jit format
kunalspathak May 10, 2024
0f89e10
Contain(op3) of CndSel if op1 is AllTrueMask
kunalspathak May 10, 2024
8e928ec
Handle FMA properly
kunalspathak May 10, 2024
c713d31
added assert
kunalspathak May 10, 2024
2 changes: 2 additions & 0 deletions src/coreclr/jit/hwintrinsicarm64.cpp
@@ -277,6 +277,8 @@ void HWIntrinsicInfo::lookupImmBounds(
case NI_AdvSimd_Arm64_StoreSelectedScalarVector128x4:
case NI_AdvSimd_Arm64_DuplicateSelectedScalarToVector128:
case NI_AdvSimd_Arm64_InsertSelectedScalar:
case NI_Sve_FusedMultiplyAddBySelectedScalar:
case NI_Sve_FusedMultiplySubtractBySelectedScalar:
immUpperBound = Compiler::getSIMDVectorLength(simdSize, baseType) - 1;
break;

59 changes: 58 additions & 1 deletion src/coreclr/jit/hwintrinsiccodegenarm64.cpp
@@ -417,10 +417,16 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
regNumber maskReg = op1Reg;
regNumber embMaskOp1Reg = REG_NA;
regNumber embMaskOp2Reg = REG_NA;
regNumber embMaskOp3Reg = REG_NA;
regNumber falseReg = op3Reg;

switch (intrinEmbMask.numOperands)
{
case 3:
assert(intrinEmbMask.op3 != nullptr);
embMaskOp3Reg = intrinEmbMask.op3->GetRegNum();
FALLTHROUGH;

case 2:
assert(intrinEmbMask.op2 != nullptr);
embMaskOp2Reg = intrinEmbMask.op2->GetRegNum();
@@ -438,6 +444,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
switch (intrinEmbMask.numOperands)
{
case 1:
{
assert(!instrIsRMW);

if (targetReg != falseReg)
@@ -488,9 +495,10 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)

GetEmitter()->emitIns_R_R_R(insEmbMask, emitSize, targetReg, maskReg, embMaskOp1Reg, opt);
break;
}

case 2:

{
assert(instrIsRMW);

if (intrin.op3->IsVectorZero())
@@ -560,7 +568,50 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
}

break;
}
case 3:
{
assert(instrIsRMW);
assert(targetReg != falseReg);
assert(targetReg != embMaskOp2Reg);
assert(targetReg != embMaskOp3Reg);
assert(!HWIntrinsicInfo::IsOptionalEmbeddedMaskedOperation(intrinEmbMask.id));

if (intrin.op3->IsVectorZero())
Member:

Should this be asserting that intrin.op3 is contained?

Member Author:

added `

{
// If `falseReg` is zero, then move the first operand of `intrinEmbMask` in the
// destination using /Z.

assert(targetReg != embMaskOp2Reg);
GetEmitter()->emitIns_R_R_R(INS_sve_movprfx, emitSize, targetReg, maskReg, embMaskOp1Reg, opt);

// Finally, perform the actual "predicated" operation so that `targetReg` is the first operand
// `embMaskOp2Reg` is the second operand and `embMaskOp3Reg` is the third operand.
GetEmitter()->emitIns_R_R_R_R(insEmbMask, emitSize, targetReg, maskReg, embMaskOp2Reg,
embMaskOp3Reg, opt);
}
else
{
// If the instruction just has "predicated" version, then move the "embMaskOp1Reg"
// into targetReg. Next, do the predicated operation on the targetReg and last,
// use "sel" to select the active lanes based on mask, and set inactive lanes
// to falseReg.

assert(HWIntrinsicInfo::IsEmbeddedMaskedOperation(intrinEmbMask.id));

if (targetReg != embMaskOp1Reg)
{
GetEmitter()->emitIns_R_R(INS_sve_movprfx, EA_SCALABLE, targetReg, embMaskOp1Reg);
}

GetEmitter()->emitIns_R_R_R_R(insEmbMask, emitSize, targetReg, maskReg, embMaskOp2Reg,
embMaskOp3Reg, opt);

GetEmitter()->emitIns_R_R_R_R(INS_sve_sel, emitSize, targetReg, maskReg, targetReg, falseReg,
opt, INS_SCALABLE_OPTS_UNPREDICATED);
Member:

Is there an assumption being made about the instruction being RMW here?

FMLA encodes 4 registers (Zda, Pg, Zn, and Zm) where Zda is both the source and destination and the operation is functionally similar to Zda += (Zn * Zm) (with only a single rounding operation).

Given some Zda = ConditionalSelect(Pg, FusedMultiplyAdd(Zda, Zn, Zm), Zda) it can then be encoded as simply:

fmla Zda, Pg/M, Zn, Zm

Given some Zda = ConditionalSelect(Pg, FusedMultiplyAdd(Zda, Zn, Zm), merge) it can then be encoded as simply:

movprfx Zda, Pg/M, merge
fmla Zda, Pg/M, Zn, Zm

Given some Zda = ConditionalSelect(Pg, FusedMultiplyAdd(Zda, Zn, Zm), Zero) it can then be encoded as simply:

movprfx Zda, Pg/Z, Zda
fmla Zda, Pg/M, Zn, Zm

There are then similar versions possible using fmad when the multiplier is the source and destination (op2Reg == tgtReg or op3Reg == tgtReg).


We should actually never need sel for this case, but only need complex generation if tgtReg is unique from all input registers (including the merge) and we're merging with a non-zero value, such as dest = ConditionalSelect(Pg, FusedMultiplyAdd(Zda, Zn, Zm), merge):

mov dest, Zda
movprfx dest, Pg/M, merge
fmla dest, Pg/M, Zn, Zm

This ends up being different from the other fallbacks that do use sel specifically because it's RMW and requires predication (that is there is no fmla (unpredicated)).

Member:

The main reason for using ins (unpredicated); sel in the other case is that it allows a 2-instruction sequence in the worst case.

In this case, we need at worst a 3-instruction sequence due to the required predication on the instruction. Thus, it becomes better to use mov; movprfx (predicated); ins (predicated) instead, as it allows the mov to be elided by the register renamer.

Member Author:

such as dest = ConditionalSelect(Pg, FusedMultiplyAdd(Zda, Zn, Zm), merge):

For similar reasoning to that mentioned in #100743 (comment) (where we should only movprfx the inactive lanes from merge -> dest), the code should be:

mov dest, Zda
fmla dest, Pg/M, Zn, Zm
sel dest, Pg/M, dest, merge

Member Author:

Actually, I misinterpreted the value of Pg/M as AllTrue. Spoke to @tannergooding offline and we would like to generate:

sel dest, Pg/M, Zda, merge
fmla dest, Pg/M, Zn, Zm
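
To make the shapes in this thread concrete, here is a minimal C# sketch (not part of the PR) of the ConditionalSelect + FusedMultiplyAdd patterns under discussion, assuming the .NET Sve API shapes Sve.ConditionalSelect(mask, left, right) and Sve.FusedMultiplyAdd(addend, left, right); the commented instruction sequences mirror the codegen outlined above, not verified JIT output.

using System.Numerics;
using System.Runtime.Intrinsics.Arm;

static class FmlaPatterns
{
    // Zda = ConditionalSelect(Pg, FusedMultiplyAdd(Zda, Zn, Zm), Zda)
    // expected: fmla Zda, Pg/M, Zn, Zm
    static Vector<float> MergeIntoAddend(Vector<float> pg, Vector<float> zda, Vector<float> zn, Vector<float> zm)
        => Sve.ConditionalSelect(pg, Sve.FusedMultiplyAdd(zda, zn, zm), zda);

    // Zda = ConditionalSelect(Pg, FusedMultiplyAdd(Zda, Zn, Zm), Zero)
    // expected: movprfx Zda, Pg/Z, Zda; fmla Zda, Pg/M, Zn, Zm
    static Vector<float> MergeWithZero(Vector<float> pg, Vector<float> zda, Vector<float> zn, Vector<float> zm)
        => Sve.ConditionalSelect(pg, Sve.FusedMultiplyAdd(zda, zn, zm), Vector<float>.Zero);

    // dest = ConditionalSelect(Pg, FusedMultiplyAdd(Zda, Zn, Zm), merge), dest distinct from every input
    // expected, per the resolution above: sel dest, Pg, Zda, merge; fmla dest, Pg/M, Zn, Zm
    static Vector<float> MergeWithOther(Vector<float> pg, Vector<float> zda, Vector<float> zn,
                                        Vector<float> zm, Vector<float> merge)
        => Sve.ConditionalSelect(pg, Sve.FusedMultiplyAdd(zda, zn, zm), merge);
}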

}
break;
}
default:
unreached();
}
@@ -627,6 +678,12 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
INS_SCALABLE_OPTS_UNPREDICATED);
}
break;
case 4:
assert(!isRMW);
GetEmitter()->emitIns_R_R_R_R(ins, emitSize, targetReg, op1Reg, op2Reg, op3Reg, opt,
INS_SCALABLE_OPTS_UNPREDICATED);
break;

default:
unreached();
}
6 changes: 6 additions & 0 deletions src/coreclr/jit/hwintrinsiclistarm64sve.h
@@ -46,6 +46,12 @@ HARDWARE_INTRINSIC(Sve, CreateWhileLessThanOrEqualMask32Bit,
HARDWARE_INTRINSIC(Sve, CreateWhileLessThanOrEqualMask64Bit, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_whilele, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(Sve, CreateWhileLessThanOrEqualMask8Bit, -1, 2, false, {INS_invalid, INS_sve_whilele, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(Sve, Divide, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_sdiv, INS_sve_udiv, INS_sve_sdiv, INS_sve_udiv, INS_sve_fdiv, INS_sve_fdiv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, FusedMultiplyAdd, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmla, INS_sve_fmla}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
Contributor:

You are always using FMLA for these. Will there be cases where FMAD might be more optimal based on register usage? If so, raise an issue to track it.

Member Author:

Currently, I am just preferring op1 as the tgtPrefUse, in other words telling LSRA to use op1 as the targetReg and marking the registers for the other operands as delayFree. With that, using FMLA will always be optimal. @tannergooding - please correct me if I missed anything here.

Contributor:

Ok, that sounds reasonable.
There might be scenarios where FMAD is still optimal: those where op2 is never reused in the C#, but op1 is reused. Using FMAD would avoid having to mov op1 into a temp.

Member:

I would definitely expect us to have some logic around picking FMLA vs FMAD.

The x64 logic is even more complex because it has to handle the RMW consideration (should the tgtPrefUse be the addend or the multiplier), but it also needs to consider which memory operand should be contained (since it supports embedded loads). That logic is here: https://github.com/dotnet/runtime/blob/main/src/coreclr/jit/lowerxarch.cpp#L9823 and you'll note that it uses node->GetResultOpNumForRmwIntrinsic to determine which of op1, op2, or op3 is both an input and an output, or otherwise which is last use. It uses this to ensure the right containment choices are being made.

x64 then repeats this logic in LSRA to actually set the tgtPrefUse: https://github.com/dotnet/runtime/blob/main/src/coreclr/jit/lsraxarch.cpp#L2432 and again in codegen to pick which instruction form it should use: https://github.com/dotnet/runtime/blob/main/src/coreclr/jit/hwintrinsiccodegenxarch.cpp#L2947

I expect that Arm64 just needs to mirror the LSRA and codegen logic (ignoring any bits relevant to containment) and pick FMLA vs FMAD (rather than 231 vs 213, respectively).
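
As a rough illustration (not from the PR) of the operand-reuse cases that drive the FMLA vs FMAD choice, assuming the Sve.FusedMultiplyAdd(addend, left, right) shape; whether fmla or fmad is actually emitted depends on the register-selection logic discussed above.

using System.Numerics;
using System.Runtime.Intrinsics.Arm;

static class FmlaVsFmad
{
    // The addend 'acc' is the value being updated and the multiplicands die each iteration,
    // so overwriting the addend register matches fmla (Zda += Zn * Zm) with no extra mov.
    static Vector<float> DotAccumulate(Vector<float>[] xs, Vector<float>[] ys)
    {
        var acc = Vector<float>.Zero;
        for (int i = 0; i < xs.Length; i++)
        {
            acc = Sve.FusedMultiplyAdd(acc, xs[i], ys[i]);
        }
        return acc;
    }

    // Here the addend 'bias' stays live (it is returned) while the multiplicand 'x' does not,
    // so overwriting the multiplicand would match fmad (Zdn = Za + Zdn * Zm) and avoid
    // copying 'bias' into a temp to preserve it.
    static (Vector<float> Fma, Vector<float> Bias) BiasedScale(Vector<float> bias, Vector<float> x, Vector<float> scale)
    {
        var fma = Sve.FusedMultiplyAdd(bias, x, scale);
        return (fma, bias);
    }
}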

HARDWARE_INTRINSIC(Sve, FusedMultiplyAddBySelectedScalar, -1, 4, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmla, INS_sve_fmla}, HW_Category_SIMDByIndexedElement, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_HasRMWSemantics)
HARDWARE_INTRINSIC(Sve, FusedMultiplyAddNegated, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fnmla, INS_sve_fnmla}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, FusedMultiplySubtract, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmls, INS_sve_fmls}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, FusedMultiplySubtractBySelectedScalar, -1, 4, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmls, INS_sve_fmls}, HW_Category_SIMDByIndexedElement, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_HasRMWSemantics)
HARDWARE_INTRINSIC(Sve, FusedMultiplySubtractNegated, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fnmls, INS_sve_fnmls}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, LoadVector, -1, 2, true, {INS_sve_ld1b, INS_sve_ld1b, INS_sve_ld1h, INS_sve_ld1h, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1d, INS_sve_ld1d, INS_sve_ld1w, INS_sve_ld1d}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, LoadVectorByteZeroExtendToInt16, -1, 2, false, {INS_invalid, INS_invalid, INS_sve_ld1b, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, LoadVectorByteZeroExtendToInt32, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1b, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
10 changes: 10 additions & 0 deletions src/coreclr/jit/lowerarmarch.cpp
@@ -3352,6 +3352,16 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}

case NI_Sve_FusedMultiplyAddBySelectedScalar:
case NI_Sve_FusedMultiplySubtractBySelectedScalar:
assert(hasImmediateOperand);
assert(varTypeIsIntegral(intrin.op4));
if (intrin.op4->IsCnsIntOrI())
{
MakeSrcContained(node, intrin.op4);
}
break;

default:
unreached();
}
24 changes: 21 additions & 3 deletions src/coreclr/jit/lsraarm64.cpp
@@ -1772,7 +1772,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
// then record delay-free for operands as well as the "merge" value
GenTreeHWIntrinsic* intrinEmbOp2 = intrin.op2->AsHWIntrinsic();
size_t numArgs = intrinEmbOp2->GetOperandCount();
assert((numArgs == 1) || (numArgs == 2));
assert((numArgs == 1) || (numArgs == 2) || (numArgs == 3));
tgtPrefUse = BuildUse(intrinEmbOp2->Op(1));
srcCount += 1;

@@ -1792,7 +1792,8 @@

assert(intrin.op1 != nullptr);

bool forceOp2DelayFree = false;
bool forceOp2DelayFree = false;
regMaskTP candidates = RBM_NONE;
if ((intrin.id == NI_Vector64_GetElement) || (intrin.id == NI_Vector128_GetElement))
{
if (!intrin.op2->IsCnsIntOrI() && (!intrin.op1->isContained() || intrin.op1->OperIsLocal()))
@@ -1815,6 +1816,22 @@
}
}

if ((intrin.id == NI_Sve_FusedMultiplyAddBySelectedScalar) ||
Contributor:

Why do these require special code here?

Member Author (@kunalspathak, May 8, 2024):

Because, per the FMLA (indexed) encoding, Zm has to be in the lower vector registers.

[Screenshot of the FMLA (indexed) encoding, showing the restricted Zm register field.]

We have similar code for AdvSimd too, and most likely, if I see more patterns in the future, I will combine this code with it.

if ((intrin.category == HW_Category_SIMDByIndexedElement) && (genTypeSize(intrin.baseType) == 2))
{
// Some "Advanced SIMD scalar x indexed element" and "Advanced SIMD vector x indexed element" instructions (e.g.
// "MLA (by element)") have encoding that restricts what registers that can be used for the indexed element when
// the element size is H (i.e. 2 bytes).
assert(intrin.op2 != nullptr);
if ((intrin.op4 != nullptr) || ((intrin.op3 != nullptr) && !hasImmediateOperand))
{
if (isRMW)
{
srcCount += BuildDelayFreeUses(intrin.op2, nullptr);
srcCount += BuildDelayFreeUses(intrin.op3, nullptr, RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS);
}
else
{
srcCount += BuildOperandUses(intrin.op2);
srcCount += BuildOperandUses(intrin.op3, RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS);
}
if (intrin.op4 != nullptr)
{
assert(hasImmediateOperand);
assert(varTypeIsIntegral(intrin.op4));
srcCount += BuildOperandUses(intrin.op4);
}
}

(intrin.id == NI_Sve_FusedMultiplySubtractBySelectedScalar))
{
// If this is common pattern, then we will add a flag in the table, but for now, just check for specific
// intrinsics
if (intrin.baseType == TYP_DOUBLE)
{
candidates = RBM_SVE_INDEXED_D_ELEMENT_ALLOWED_REGS;
}
else
{
assert(intrin.baseType == TYP_FLOAT);
candidates = RBM_SVE_INDEXED_S_ELEMENT_ALLOWED_REGS;
}
}

if ((intrin.id == NI_Sve_ConditionalSelect) && (intrin.op2->IsEmbMaskOp()) &&
(intrin.op2->isRMWHWIntrinsic(compiler)))
{
@@ -1845,7 +1862,8 @@

if (intrin.op3 != nullptr)
{
srcCount += isRMW ? BuildDelayFreeUses(intrin.op3, intrin.op1) : BuildOperandUses(intrin.op3);
srcCount += isRMW ? BuildDelayFreeUses(intrin.op3, intrin.op1, candidates)
: BuildOperandUses(intrin.op3, candidates);

if (intrin.op4 != nullptr)
{
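For reference, a small usage sketch (not from the PR) of the indexed-element form discussed above, assuming the Sve.FusedMultiplyAddBySelectedScalar(addend, left, right, rightIndex) shape with a constant index; 'right' is the operand that the lowering/LSRA changes above steer into the restricted register set (V0-V7 for float, V0-V15 for double, matching the masks added in targetarm64.h).

using System.Numerics;
using System.Runtime.Intrinsics.Arm;

static class IndexedFma
{
    static Vector<float> MulAddLane2(Vector<float> acc, Vector<float> a, Vector<float> b)
    {
        // The lane index must be a JIT-time constant so it can be contained and encoded as the
        // immediate of fmla (indexed); lookupImmBounds caps it via getSIMDVectorLength(simdSize, baseType) - 1.
        return Sve.FusedMultiplyAddBySelectedScalar(acc, a, b, 2);
    }
}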
5 changes: 2 additions & 3 deletions src/coreclr/jit/morph.cpp
@@ -10717,10 +10717,9 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}

unsigned simdBaseTypeSize = genTypeSize(node->GetSimdBaseType());
GenTreeHWIntrinsic* cvtOp2 = op2->AsHWIntrinsic();
unsigned simdBaseTypeSize = genTypeSize(node->GetSimdBaseType());

if ((genTypeSize(cvtOp2->GetSimdBaseType()) != simdBaseTypeSize))
if (!op2->OperIsHWIntrinsic() || (genTypeSize(op2->AsHWIntrinsic()->GetSimdBaseType()) != simdBaseTypeSize))
{
// We need the operand to be the same kind of mask; otherwise
// the bitwise operation can differ in how it performs
4 changes: 3 additions & 1 deletion src/coreclr/jit/targetarm64.h
@@ -376,9 +376,11 @@
// For arm64, this is the maximum prolog establishment pre-indexed (that is SP pre-decrement) offset.
#define STACK_PROBE_BOUNDARY_THRESHOLD_BYTES 512

// Some "Advanced SIMD scalar x indexed element" and "Advanced SIMD vector x indexed element" instructions (e.g. "MLA (by element)")
// Some "Advanced SIMD / SVE scalar x indexed element" and "Advanced SIMD / SVE vector x indexed element" instructions (e.g. "MLA (by element)")
// have encoding that restricts what registers that can be used for the indexed element when the element size is H (i.e. 2 bytes).
#define RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS (RBM_V0|RBM_V1|RBM_V2|RBM_V3|RBM_V4|RBM_V5|RBM_V6|RBM_V7|RBM_V8|RBM_V9|RBM_V10|RBM_V11|RBM_V12|RBM_V13|RBM_V14|RBM_V15)
#define RBM_SVE_INDEXED_S_ELEMENT_ALLOWED_REGS (RBM_V0|RBM_V1|RBM_V2|RBM_V3|RBM_V4|RBM_V5|RBM_V6|RBM_V7)
#define RBM_SVE_INDEXED_D_ELEMENT_ALLOWED_REGS RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS

#define REG_ZERO_INIT_FRAME_REG1 REG_R9
#define REG_ZERO_INIT_FRAME_REG2 REG_R10