Skip to content

Commit

Permalink
Expose helper APIS for GetLower/Upper and WithLower/Upper (#83982)
Browse files Browse the repository at this point in the history
* Expose and use a gtNewSimdGetLowerNode and gtNewSimdGetUpperNode

* Expose and use a gtNewSimdWithLowerNode and gtNewSimdWithUpperNode

* Apply formatting patch

* Ensure op1 and op2 are passed for WithLower/Upper

* Ensure we aren't creating unnecessary idx nodes

* Ensure args are popped in the right order

* Ensure Vector512.WithUpper/Lower are handled as intrinsic

* Ensure Vector512.GetLower/Upper and WithLower/Upper are fully hooked up

* Applying formatting patch

* Fix a copy/paste error

* Move NI_Vector128_GetUpper to be handled in codegen to improve emitted code

* Fix an assert
  • Loading branch information
tannergooding authored Mar 30, 2023
1 parent e348b5c commit 57ae91c
Show file tree
Hide file tree
Showing 16 changed files with 613 additions and 224 deletions.
1 change: 1 addition & 0 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5670,6 +5670,7 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
case NI_SSE41_X64_Extract:
case NI_AVX_ExtractVector128:
case NI_AVX2_ExtractVector128:
case NI_AVX512F_ExtractVector256:
{
// These intrinsics are "ins reg/mem, xmm, imm8"
ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
Expand Down
26 changes: 26 additions & 0 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -2700,6 +2700,18 @@ class Compiler
unsigned simdSize,
bool isSimdAsHWIntrinsic);

GenTree* gtNewSimdGetLowerNode(var_types type,
GenTree* op1,
CorInfoType simdBaseJitType,
unsigned simdSize,
bool isSimdAsHWIntrinsic);

GenTree* gtNewSimdGetUpperNode(var_types type,
GenTree* op1,
CorInfoType simdBaseJitType,
unsigned simdSize,
bool isSimdAsHWIntrinsic);

GenTree* gtNewSimdLoadNode(
var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic);

Expand Down Expand Up @@ -2773,6 +2785,20 @@ class Compiler
unsigned simdSize,
bool isSimdAsHWIntrinsic);

GenTree* gtNewSimdWithLowerNode(var_types type,
GenTree* op1,
GenTree* op2,
CorInfoType simdBaseJitType,
unsigned simdSize,
bool isSimdAsHWIntrinsic);

GenTree* gtNewSimdWithUpperNode(var_types type,
GenTree* op1,
GenTree* op2,
CorInfoType simdBaseJitType,
unsigned simdSize,
bool isSimdAsHWIntrinsic);

GenTreeHWIntrinsic* gtNewScalarHWIntrinsicNode(var_types type, NamedIntrinsic hwIntrinsicID);
GenTreeHWIntrinsic* gtNewScalarHWIntrinsicNode(var_types type, GenTree* op1, NamedIntrinsic hwIntrinsicID);
GenTreeHWIntrinsic* gtNewScalarHWIntrinsicNode(var_types type,
Expand Down
13 changes: 13 additions & 0 deletions src/coreclr/jit/emit.h
Original file line number Diff line number Diff line change
Expand Up @@ -3350,6 +3350,7 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)
// Note: vextractf128 has a 128-bit output (register or memory) but a 256-bit input (register).
// vinsertf128 is the inverse with a 256-bit output (register), a 256-bit input(register),
// and a 128-bit input (register or memory).
// Similarly, vextractf64x4 has a 256-bit output and 128-bit input and vinsertf64x4 the inverse
// This method is mainly used for such instructions to return the appropriate memory operand
// size, otherwise returns the regular operand size of the instruction.

Expand Down Expand Up @@ -3492,6 +3493,18 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)
return EA_16BYTE;
}

case INS_vextractf32x8:
case INS_vextracti32x8:
case INS_vextractf64x4:
case INS_vextracti64x4:
case INS_vinsertf32x8:
case INS_vinserti32x8:
case INS_vinsertf64x4:
case INS_vinserti64x4:
{
return EA_32BYTE;
}

case INS_movddup:
{
if (defaultSize == 32)
Expand Down
84 changes: 73 additions & 11 deletions src/coreclr/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6357,7 +6357,11 @@ void emitter::emitIns_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regN
case INS_pextrw_sse41:
case INS_extractps:
case INS_vextractf128:
case INS_vextractf32x8:
case INS_vextractf64x4:
case INS_vextracti128:
case INS_vextracti64x4:
case INS_vextracti32x8:
case INS_shld:
case INS_shrd:
{
Expand Down Expand Up @@ -6841,7 +6845,11 @@ void emitter::emitIns_R_R_R_I(
case INS_pextrw_sse41:
case INS_extractps:
case INS_vextractf128:
case INS_vextractf32x8:
case INS_vextractf64x4:
case INS_vextracti128:
case INS_vextracti64x4:
case INS_vextracti32x8:
{
code = insCodeMR(ins);
break;
Expand Down Expand Up @@ -10724,10 +10732,30 @@ void emitter::emitDispIns(

case IF_AWR_RRD_CNS:
{
if ((ins == INS_vextracti128) || (ins == INS_vextractf128))
switch (ins)
{
// vextracti/f128 extracts 128-bit data, so we fix sstr as "xmm ptr"
sstr = codeGen->genSizeStr(EA_ATTR(16));
case INS_vextractf128:
case INS_vextracti128:
{
// vextracti/f128 extracts 128-bit data, so we fix sstr as "xmm ptr"
sstr = codeGen->genSizeStr(EA_ATTR(16));
break;
}

case INS_vextractf32x8:
case INS_vextractf64x4:
case INS_vextracti64x4:
case INS_vextracti32x8:
{
// vextracti/f*x* extracts 256-bit data, so we fix sstr as "ymm ptr"
sstr = codeGen->genSizeStr(EA_ATTR(32));
break;
}

default:
{
break;
}
}

printf(sstr);
Expand Down Expand Up @@ -11121,6 +11149,7 @@ void emitter::emitDispIns(
attr = EA_32BYTE;
break;
}

case INS_vinsertf128:
case INS_vinserti128:
{
Expand Down Expand Up @@ -11173,6 +11202,15 @@ void emitter::emitDispIns(
break;
}

case INS_vextractf32x8:
case INS_vextractf64x4:
case INS_vextracti64x4:
case INS_vextracti32x8:
{
tgtAttr = EA_32BYTE;
break;
}

case INS_extractps:
case INS_pextrb:
case INS_pextrw:
Expand Down Expand Up @@ -11289,10 +11327,30 @@ void emitter::emitDispIns(

case IF_MWR_RRD_CNS:
{
if ((ins == INS_vextracti128) || (ins == INS_vextractf128))
switch (ins)
{
// vextracti/f128 extracts 128-bit data, so we fix sstr as "xmm ptr"
sstr = codeGen->genSizeStr(EA_ATTR(16));
case INS_vextractf128:
case INS_vextracti128:
{
// vextracti/f128 extracts 128-bit data, so we fix sstr as "xmm ptr"
sstr = codeGen->genSizeStr(EA_ATTR(16));
break;
}

case INS_vextractf32x8:
case INS_vextractf64x4:
case INS_vextracti64x4:
case INS_vextracti32x8:
{
// vextracti/f*x* extracts 256-bit data, so we fix sstr as "ymm ptr"
sstr = codeGen->genSizeStr(EA_ATTR(32));
break;
}

default:
{
break;
}
}

printf(sstr);
Expand Down Expand Up @@ -16272,12 +16330,12 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
break;

case IF_MWR_RRD_CNS:
assert(ins == INS_vextracti128 || ins == INS_vextractf128);
assert((ins == INS_vextractf128) || (ins == INS_vextractf32x8) || (ins == INS_vextractf64x4) ||
(ins == INS_vextracti128) || (ins == INS_vextracti32x8) || (ins == INS_vextracti64x4));
assert(UseSimdEncoding());
emitGetInsDcmCns(id, &cnsVal);
code = insCodeMR(ins);
// only AVX2 vextracti128 and AVX vextractf128 can reach this path,
// they do not need VEX.vvvv to encode the register operand
// we do not need VEX.vvvv to encode the register operand
dst = emitOutputCV(dst, id, code, &cnsVal);
sz = emitSizeOfInsDsc(id);
break;
Expand Down Expand Up @@ -17849,12 +17907,16 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
case INS_vperm2i128:
case INS_vperm2f128:
case INS_vextractf128:
case INS_vextractf32x8:
case INS_vextractf64x4:
case INS_vextracti128:
case INS_vextracti32x8:
case INS_vextracti64x4:
case INS_vinsertf128:
case INS_vinserti128:
case INS_vinsertf64x4:
case INS_vinserti64x4:
case INS_vinsertf32x8:
case INS_vinserti128:
case INS_vinserti64x4:
case INS_vinserti32x8:
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency += PERFSCORE_LATENCY_3C;
Expand Down
Loading

0 comments on commit 57ae91c

Please sign in to comment.