From 57ae91cf6fc5672e34705b1a272cf268761d505a Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 29 Mar 2023 21:24:24 -0700 Subject: [PATCH] Expose helper APIS for GetLower/Upper and WithLower/Upper (#83982) * Expose and use a gtNewSimdGetLowerNode and gtNewSimdGetUpperNode * Expose and use a gtNewSimdWithLowerNode and gtNewSimdWithUpperNode * Apply formatting patch * Ensure op1 and op2 are passed for WithLower/Upper * Ensure we aren't creating unnecessary idx nodes * Ensure args are popped in the right order * Ensure Vector512.WithUpper/Lower are handled as intrinsic * Ensure Vector512.GetLower/Upper and WithLower/Upper are fully hooked up * Applying formatting patch * Fix a copy/paste error * Move NI_Vector128_GetUpper to be handled in codegen to improve emitted code * Fix an assert --- src/coreclr/jit/codegenxarch.cpp | 1 + src/coreclr/jit/compiler.h | 26 ++ src/coreclr/jit/emit.h | 13 + src/coreclr/jit/emitxarch.cpp | 84 +++++- src/coreclr/jit/gentree.cpp | 169 ++++++++++-- src/coreclr/jit/hwintrinsicarm64.cpp | 56 ++-- src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 7 + src/coreclr/jit/hwintrinsiclistarm64.h | 6 +- src/coreclr/jit/hwintrinsiclistxarch.h | 13 +- src/coreclr/jit/hwintrinsicxarch.cpp | 100 ++++++- src/coreclr/jit/instrsxarch.h | 4 + src/coreclr/jit/lowerarmarch.cpp | 22 ++ src/coreclr/jit/lowerxarch.cpp | 245 ++++++++++++------ .../System/Runtime/Intrinsics/Vector128.cs | 28 +- .../System/Runtime/Intrinsics/Vector256.cs | 60 +---- .../System/Runtime/Intrinsics/Vector512.cs | 3 + 16 files changed, 613 insertions(+), 224 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 5205c46b92756..e89b18b3b229a 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -5670,6 +5670,7 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree) case NI_SSE41_X64_Extract: case NI_AVX_ExtractVector128: case NI_AVX2_ExtractVector128: + case NI_AVX512F_ExtractVector256: { // These 
intrinsics are "ins reg/mem, xmm, imm8" ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 85ec2b3c1b946..03c317da5c42e 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -2700,6 +2700,18 @@ class Compiler unsigned simdSize, bool isSimdAsHWIntrinsic); + GenTree* gtNewSimdGetLowerNode(var_types type, + GenTree* op1, + CorInfoType simdBaseJitType, + unsigned simdSize, + bool isSimdAsHWIntrinsic); + + GenTree* gtNewSimdGetUpperNode(var_types type, + GenTree* op1, + CorInfoType simdBaseJitType, + unsigned simdSize, + bool isSimdAsHWIntrinsic); + GenTree* gtNewSimdLoadNode( var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic); @@ -2773,6 +2785,20 @@ class Compiler unsigned simdSize, bool isSimdAsHWIntrinsic); + GenTree* gtNewSimdWithLowerNode(var_types type, + GenTree* op1, + GenTree* op2, + CorInfoType simdBaseJitType, + unsigned simdSize, + bool isSimdAsHWIntrinsic); + + GenTree* gtNewSimdWithUpperNode(var_types type, + GenTree* op1, + GenTree* op2, + CorInfoType simdBaseJitType, + unsigned simdSize, + bool isSimdAsHWIntrinsic); + GenTreeHWIntrinsic* gtNewScalarHWIntrinsicNode(var_types type, NamedIntrinsic hwIntrinsicID); GenTreeHWIntrinsic* gtNewScalarHWIntrinsicNode(var_types type, GenTree* op1, NamedIntrinsic hwIntrinsicID); GenTreeHWIntrinsic* gtNewScalarHWIntrinsicNode(var_types type, diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index f137cff3c306e..1f9fa295c91e4 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -3350,6 +3350,7 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id) // Note: vextractf128 has a 128-bit output (register or memory) but a 256-bit input (register). // vinsertf128 is the inverse with a 256-bit output (register), a 256-bit input(register), // and a 128-bit input (register or memory). 
+// Similarly, vextractf64x4 has a 256-bit output and 512-bit input and vinsertf64x4 the inverse // This method is mainly used for such instructions to return the appropriate memory operand // size, otherwise returns the regular operand size of the instruction. @@ -3492,6 +3493,18 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id) return EA_16BYTE; } + case INS_vextractf32x8: + case INS_vextracti32x8: + case INS_vextractf64x4: + case INS_vextracti64x4: + case INS_vinsertf32x8: + case INS_vinserti32x8: + case INS_vinsertf64x4: + case INS_vinserti64x4: + { + return EA_32BYTE; + } + case INS_movddup: { if (defaultSize == 32) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index dfbf51e22afdb..586dcbdf2892c 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -6357,7 +6357,11 @@ void emitter::emitIns_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regN case INS_pextrw_sse41: case INS_extractps: case INS_vextractf128: + case INS_vextractf32x8: + case INS_vextractf64x4: case INS_vextracti128: + case INS_vextracti64x4: + case INS_vextracti32x8: case INS_shld: case INS_shrd: { @@ -6841,7 +6845,11 @@ void emitter::emitIns_R_R_R_I( case INS_pextrw_sse41: case INS_extractps: case INS_vextractf128: + case INS_vextractf32x8: + case INS_vextractf64x4: case INS_vextracti128: + case INS_vextracti64x4: + case INS_vextracti32x8: { code = insCodeMR(ins); break; @@ -10724,10 +10732,30 @@ void emitter::emitDispIns( case IF_AWR_RRD_CNS: { - if ((ins == INS_vextracti128) || (ins == INS_vextractf128)) + switch (ins) { - // vextracti/f128 extracts 128-bit data, so we fix sstr as "xmm ptr" - sstr = codeGen->genSizeStr(EA_ATTR(16)); + case INS_vextractf128: + case INS_vextracti128: + { + // vextracti/f128 extracts 128-bit data, so we fix sstr as "xmm ptr" + sstr = codeGen->genSizeStr(EA_ATTR(16)); + break; + } + + case INS_vextractf32x8: + case INS_vextractf64x4: + case INS_vextracti64x4: + case INS_vextracti32x8: + { + 
// vextracti/f*x* extracts 256-bit data, so we fix sstr as "ymm ptr" + sstr = codeGen->genSizeStr(EA_ATTR(32)); + break; + } + + default: + { + break; + } } printf(sstr); @@ -11121,6 +11149,7 @@ void emitter::emitDispIns( attr = EA_32BYTE; break; } + case INS_vinsertf128: case INS_vinserti128: { @@ -11173,6 +11202,15 @@ void emitter::emitDispIns( break; } + case INS_vextractf32x8: + case INS_vextractf64x4: + case INS_vextracti64x4: + case INS_vextracti32x8: + { + tgtAttr = EA_32BYTE; + break; + } + case INS_extractps: case INS_pextrb: case INS_pextrw: @@ -11289,10 +11327,30 @@ void emitter::emitDispIns( case IF_MWR_RRD_CNS: { - if ((ins == INS_vextracti128) || (ins == INS_vextractf128)) + switch (ins) { - // vextracti/f128 extracts 128-bit data, so we fix sstr as "xmm ptr" - sstr = codeGen->genSizeStr(EA_ATTR(16)); + case INS_vextractf128: + case INS_vextracti128: + { + // vextracti/f128 extracts 128-bit data, so we fix sstr as "xmm ptr" + sstr = codeGen->genSizeStr(EA_ATTR(16)); + break; + } + + case INS_vextractf32x8: + case INS_vextractf64x4: + case INS_vextracti64x4: + case INS_vextracti32x8: + { + // vextracti/f*x* extracts 256-bit data, so we fix sstr as "ymm ptr" + sstr = codeGen->genSizeStr(EA_ATTR(32)); + break; + } + + default: + { + break; + } } printf(sstr); @@ -16272,12 +16330,12 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) break; case IF_MWR_RRD_CNS: - assert(ins == INS_vextracti128 || ins == INS_vextractf128); + assert((ins == INS_vextractf128) || (ins == INS_vextractf32x8) || (ins == INS_vextractf64x4) || + (ins == INS_vextracti128) || (ins == INS_vextracti32x8) || (ins == INS_vextracti64x4)); assert(UseSimdEncoding()); emitGetInsDcmCns(id, &cnsVal); code = insCodeMR(ins); - // only AVX2 vextracti128 and AVX vextractf128 can reach this path, - // they do not need VEX.vvvv to encode the register operand + // we do not need VEX.vvvv to encode the register operand dst = emitOutputCV(dst, id, code, &cnsVal); sz = 
emitSizeOfInsDsc(id); break; @@ -17849,12 +17907,16 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_vperm2i128: case INS_vperm2f128: case INS_vextractf128: + case INS_vextractf32x8: + case INS_vextractf64x4: case INS_vextracti128: + case INS_vextracti32x8: + case INS_vextracti64x4: case INS_vinsertf128: - case INS_vinserti128: case INS_vinsertf64x4: - case INS_vinserti64x4: case INS_vinsertf32x8: + case INS_vinserti128: + case INS_vinserti64x4: case INS_vinserti32x8: result.insThroughput = PERFSCORE_THROUGHPUT_1C; result.insLatency += PERFSCORE_LATENCY_3C; diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 027bc375cf076..47ce69981e1d8 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -19234,6 +19234,7 @@ bool GenTree::isContainableHWIntrinsic() const case NI_AVX2_ConvertToInt32: case NI_AVX2_ConvertToUInt32: case NI_AVX2_ExtractVector128: + case NI_AVX512F_ExtractVector256: { // These HWIntrinsic operations are contained as part of a store return true; @@ -21815,6 +21816,66 @@ GenTree* Compiler::gtNewSimdGetElementNode(var_types type, return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsicId, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); } +GenTree* Compiler::gtNewSimdGetLowerNode( + var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic) +{ + var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); + assert(varTypeIsArithmetic(simdBaseType)); + + NamedIntrinsic intrinsicId = NI_Illegal; + +#if defined(TARGET_XARCH) + if (simdSize == 32) + { + assert(type == TYP_SIMD16); + intrinsicId = NI_Vector256_GetLower; + } + else + { + assert((type == TYP_SIMD32) && (simdSize == 64)); + intrinsicId = NI_Vector512_GetLower; + } +#elif defined(TARGET_ARM64) + assert((type == TYP_SIMD8) && (simdSize == 16)); + intrinsicId = NI_Vector128_GetLower; +#else +#error Unsupported platform +#endif // !TARGET_XARCH && 
!TARGET_ARM64 + + assert(intrinsicId != NI_Illegal); + return gtNewSimdHWIntrinsicNode(type, op1, intrinsicId, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); +} + +GenTree* Compiler::gtNewSimdGetUpperNode( + var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic) +{ + var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); + assert(varTypeIsArithmetic(simdBaseType)); + + NamedIntrinsic intrinsicId = NI_Illegal; + +#if defined(TARGET_XARCH) + if (simdSize == 32) + { + assert(type == TYP_SIMD16); + intrinsicId = NI_Vector256_GetUpper; + } + else + { + assert((type == TYP_SIMD32) && (simdSize == 64)); + intrinsicId = NI_Vector512_GetUpper; + } +#elif defined(TARGET_ARM64) + assert((type == TYP_SIMD8) && (simdSize == 16)); + intrinsicId = NI_Vector128_GetUpper; +#else +#error Unsupported platform +#endif // !TARGET_XARCH && !TARGET_ARM64 + + assert(intrinsicId != NI_Illegal); + return gtNewSimdHWIntrinsicNode(type, op1, intrinsicId, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); +} + //---------------------------------------------------------------------------------------------- // Compiler::gtNewSimdLoadNode: Creates a new simd Load node // @@ -22544,8 +22605,7 @@ GenTree* Compiler::gtNewSimdNarrowNode(var_types type, tmp1 = gtNewSimdHWIntrinsicNode(type, tmp1, NI_Vector128_ToVector256Unsafe, simdBaseJitType, 16, isSimdAsHWIntrinsic); - return gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, gtNewIconNode(1), NI_AVX_InsertVector128, - simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + return gtNewSimdWithUpperNode(type, tmp1, tmp2, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); } default: @@ -22765,8 +22825,7 @@ GenTree* Compiler::gtNewSimdNarrowNode(var_types type, tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector64_ToVector128Unsafe, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); - tmp2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, gtNewIconNode(1), op2, NI_AdvSimd_InsertScalar, - tmp2BaseJitType, 
16, isSimdAsHWIntrinsic); + tmp2 = gtNewSimdWithUpperNode(TYP_SIMD16, tmp1, op2, tmp2BaseJitType, 16, isSimdAsHWIntrinsic); return gtNewSimdHWIntrinsicNode(type, tmp2, NI_AdvSimd_Arm64_ConvertToSingleLower, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); @@ -22782,8 +22841,7 @@ GenTree* Compiler::gtNewSimdNarrowNode(var_types type, tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector64_ToVector128Unsafe, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); - tmp2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, gtNewIconNode(1), op2, NI_AdvSimd_InsertScalar, - tmp2BaseJitType, 16, isSimdAsHWIntrinsic); + tmp2 = gtNewSimdWithUpperNode(TYP_SIMD16, tmp1, op2, tmp2BaseJitType, 16, isSimdAsHWIntrinsic); return gtNewSimdHWIntrinsicNode(type, tmp2, NI_AdvSimd_ExtractNarrowingLower, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); @@ -22935,8 +22993,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(var_types type, simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE; GenTree* op1Dup = fgMakeMultiUse(&op1, clsHnd); - GenTree* op1Lower = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector256_GetLower, simdBaseJitType, - simdSize, isSimdAsHWIntrinsic); + GenTree* op1Lower = gtNewSimdGetLowerNode(TYP_SIMD16, op1, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); op2 = gtNewVconNode(TYP_SIMD16); op2->AsVecCon()->gtSimd16Val = vecCns.v128[0]; @@ -22944,8 +23001,8 @@ GenTree* Compiler::gtNewSimdShuffleNode(var_types type, op1Lower = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Lower, op2, NI_SSSE3_Shuffle, simdBaseJitType, 16, isSimdAsHWIntrinsic); - GenTree* op1Upper = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Dup, gtNewIconNode(1), NI_AVX_ExtractVector128, - simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + GenTree* op1Upper = + gtNewSimdGetUpperNode(TYP_SIMD16, op1Dup, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); op2 = gtNewVconNode(TYP_SIMD16); op2->AsVecCon()->gtSimd16Val = vecCns.v128[1]; @@ -22953,8 +23010,7 @@ GenTree* 
Compiler::gtNewSimdShuffleNode(var_types type, op1Upper = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Upper, op2, NI_SSSE3_Shuffle, simdBaseJitType, 16, isSimdAsHWIntrinsic); - return gtNewSimdHWIntrinsicNode(type, op1Lower, op1Upper, gtNewIconNode(1), NI_AVX_InsertVector128, - simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + return gtNewSimdWithUpperNode(type, op1Lower, op1Upper, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); } if (elementSize == 4) @@ -23350,11 +23406,9 @@ GenTree* Compiler::gtNewSimdSumNode( intrinsic = (simdBaseType == TYP_FLOAT) ? NI_SSE_Add : NI_SSE2_Add; tmp = fgMakeMultiUse(&op1, clsHnd); - op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, gtNewIconNode(0x01, TYP_INT), NI_AVX_ExtractVector128, - simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + op1 = gtNewSimdGetUpperNode(TYP_SIMD16, op1, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); - tmp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp, NI_Vector256_GetLower, simdBaseJitType, simdSize, - isSimdAsHWIntrinsic); + tmp = gtNewSimdGetLowerNode(TYP_SIMD16, tmp, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, tmp, intrinsic, simdBaseJitType, 16, isSimdAsHWIntrinsic); } @@ -23551,8 +23605,7 @@ GenTree* Compiler::gtNewSimdWidenLowerNode( assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); assert(!varTypeIsIntegral(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2)); - tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector256_GetLower, simdBaseJitType, simdSize, - isSimdAsHWIntrinsic); + tmp1 = gtNewSimdGetLowerNode(TYP_SIMD16, op1, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); switch (simdBaseType) { @@ -23652,8 +23705,7 @@ GenTree* Compiler::gtNewSimdWidenLowerNode( #elif defined(TARGET_ARM64) if (simdSize == 16) { - tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_Vector128_GetLower, simdBaseJitType, simdSize, - isSimdAsHWIntrinsic); + tmp1 = gtNewSimdGetLowerNode(TYP_SIMD8, op1, simdBaseJitType, simdSize, 
isSimdAsHWIntrinsic); } else { @@ -23680,8 +23732,7 @@ GenTree* Compiler::gtNewSimdWidenLowerNode( if (simdSize == 8) { - tmp1 = - gtNewSimdHWIntrinsicNode(TYP_SIMD8, tmp1, NI_Vector128_GetLower, simdBaseJitType, 16, isSimdAsHWIntrinsic); + tmp1 = gtNewSimdGetLowerNode(TYP_SIMD8, tmp1, simdBaseJitType, 16, isSimdAsHWIntrinsic); } return tmp1; @@ -23714,8 +23765,7 @@ GenTree* Compiler::gtNewSimdWidenUpperNode( assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); assert(!varTypeIsIntegral(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2)); - tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, gtNewIconNode(1), NI_AVX_ExtractVector128, simdBaseJitType, - simdSize, isSimdAsHWIntrinsic); + tmp1 = gtNewSimdGetUpperNode(TYP_SIMD16, op1, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); switch (simdBaseType) { @@ -23866,10 +23916,7 @@ GenTree* Compiler::gtNewSimdWidenUpperNode( tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); zero = gtNewZeroConNode(TYP_SIMD16); - tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, zero, gtNewIconNode(index), NI_AdvSimd_ExtractVector128, - simdBaseJitType, 16, isSimdAsHWIntrinsic); - return gtNewSimdHWIntrinsicNode(TYP_SIMD8, tmp1, NI_Vector128_GetLower, simdBaseJitType, simdSize, - isSimdAsHWIntrinsic); + return gtNewSimdGetUpperNode(TYP_SIMD8, tmp1, simdBaseJitType, 16, isSimdAsHWIntrinsic); } #else #error Unsupported platform @@ -23960,6 +24007,72 @@ GenTree* Compiler::gtNewSimdWithElementNode(var_types type, return gtNewSimdHWIntrinsicNode(type, op1, op2, op3, hwIntrinsicID, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); } +GenTree* Compiler::gtNewSimdWithLowerNode(var_types type, + GenTree* op1, + GenTree* op2, + CorInfoType simdBaseJitType, + unsigned simdSize, + bool isSimdAsHWIntrinsic) +{ + var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); + assert(varTypeIsArithmetic(simdBaseType)); + + NamedIntrinsic intrinsicId = NI_Illegal; + +#if 
defined(TARGET_XARCH) + if (simdSize == 32) + { + assert(type == TYP_SIMD32); + intrinsicId = NI_Vector256_WithLower; + } + else + { + assert((type == TYP_SIMD64) && (simdSize == 64)); + intrinsicId = NI_Vector512_WithLower; + } +#elif defined(TARGET_ARM64) + assert((type == TYP_SIMD16) && (simdSize == 16)); + intrinsicId = NI_Vector128_WithLower; +#else +#error Unsupported platform +#endif // !TARGET_XARCH && !TARGET_ARM64 + + return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsicId, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); +} + +GenTree* Compiler::gtNewSimdWithUpperNode(var_types type, + GenTree* op1, + GenTree* op2, + CorInfoType simdBaseJitType, + unsigned simdSize, + bool isSimdAsHWIntrinsic) +{ + var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); + assert(varTypeIsArithmetic(simdBaseType)); + + NamedIntrinsic intrinsicId = NI_Illegal; + +#if defined(TARGET_XARCH) + if (simdSize == 32) + { + assert(type == TYP_SIMD32); + intrinsicId = NI_Vector256_WithUpper; + } + else + { + assert((type == TYP_SIMD64) && (simdSize == 64)); + intrinsicId = NI_Vector512_WithUpper; + } +#elif defined(TARGET_ARM64) + assert((type == TYP_SIMD16) && (simdSize == 16)); + intrinsicId = NI_Vector128_WithUpper; +#else +#error Unsupported platform +#endif // !TARGET_XARCH && !TARGET_ARM64 + + return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsicId, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); +} + GenTreeHWIntrinsic* Compiler::gtNewScalarHWIntrinsicNode(var_types type, NamedIntrinsic hwIntrinsicID) { return new (this, GT_HWINTRINSIC) GenTreeHWIntrinsic(type, getAllocator(CMK_ASTNode), hwIntrinsicID, diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index 01f7a4420df1b..a09ca2d1e6d9b 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -424,7 +424,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, assert(retType == TYP_SIMD8); op1 = 
impSIMDPopStack(TYP_SIMD16); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_Vector128_GetLower, simdBaseJitType, simdSize); + retNode = gtNewSimdGetLowerNode(TYP_SIMD8, op1, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ false); break; } @@ -1061,8 +1062,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = impCloneExpr(op1, &op2, simdClsHnd, CHECK_SPILL_ALL, nullptr DEBUGARG("Clone op1 for vector extractmostsignificantbits")); - op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_Vector128_GetLower, simdBaseJitType, simdSize, - /* isSimdAsHWIntrinsic */ false); + op1 = gtNewSimdGetLowerNode(TYP_SIMD8, op1, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ false); op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_AddAcross, simdBaseJitType, 8, /* isSimdAsHWIntrinsic */ false); op1 = gtNewSimdHWIntrinsicNode(simdBaseType, op1, NI_Vector64_ToScalar, simdBaseJitType, 8, @@ -1072,10 +1073,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, GenTree* zero = gtNewZeroConNode(TYP_SIMD16); ssize_t index = 8 / genTypeSize(simdBaseType); - op2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, zero, gtNewIconNode(index), NI_AdvSimd_ExtractVector128, - simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); - op2 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op2, NI_Vector128_GetLower, simdBaseJitType, simdSize, - /* isSimdAsHWIntrinsic */ false); + op2 = gtNewSimdGetUpperNode(TYP_SIMD8, op2, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ false); op2 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op2, NI_AdvSimd_Arm64_AddAcross, simdBaseJitType, 8, /* isSimdAsHWIntrinsic */ false); op2 = gtNewSimdHWIntrinsicNode(simdBaseType, op2, NI_Vector64_ToScalar, simdBaseJitType, 8, @@ -1170,18 +1169,21 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector128_GetLower: + { + assert(sig->numArgs == 1); + + op1 = impSIMDPopStack(getSIMDTypeForSize(simdSize)); + retNode = 
gtNewSimdGetLowerNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); + break; + } + case NI_Vector128_GetUpper: { - // Converts to equivalent managed code: - // AdvSimd.ExtractVector128(vector, Vector128.Zero, 8 / sizeof(T)).GetLower(); - assert(numArgs == 1); - op1 = impPopStack().val; - GenTree* zero = gtNewZeroConNode(retType); - ssize_t index = 8 / genTypeSize(simdBaseType); + assert(sig->numArgs == 1); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, zero, gtNewIconNode(index), NI_AdvSimd_ExtractVector128, - simdBaseJitType, simdSize); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD8, retNode, NI_Vector128_GetLower, simdBaseJitType, 8); + op1 = impSIMDPopStack(getSIMDTypeForSize(simdSize)); + retNode = gtNewSimdGetUpperNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); break; } @@ -1836,6 +1838,28 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector128_WithLower: + { + assert(sig->numArgs == 2); + + op2 = impSIMDPopStack(TYP_SIMD8); + op1 = impSIMDPopStack(TYP_SIMD16); + retNode = gtNewSimdWithLowerNode(retType, op1, op2, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ false); + break; + } + + case NI_Vector128_WithUpper: + { + assert(sig->numArgs == 2); + + op2 = impSIMDPopStack(TYP_SIMD8); + op1 = impSIMDPopStack(TYP_SIMD16); + retNode = gtNewSimdWithUpperNode(retType, op1, op2, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ false); + break; + } + case NI_Vector64_Xor: case NI_Vector128_Xor: case NI_Vector64_op_ExclusiveOr: diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index 406aa2fad5d36..7c0354959f3a7 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -955,6 +955,13 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_Vector128_GetUpper: + { + const int byteIndex = 8; + GetEmitter()->emitIns_R_R_R_I(ins, 
emitSize, targetReg, op1Reg, op1Reg, byteIndex, INS_OPTS_16B); + break; + } + case NI_Vector128_AsVector3: { // AsVector3 can be a no-op when it's already in the right register, otherwise diff --git a/src/coreclr/jit/hwintrinsiclistarm64.h b/src/coreclr/jit/hwintrinsiclistarm64.h index b9d1ab59b686d..a85ff2db6c53f 100644 --- a/src/coreclr/jit/hwintrinsiclistarm64.h +++ b/src/coreclr/jit/hwintrinsiclistarm64.h @@ -162,8 +162,8 @@ HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, HARDWARE_INTRINSIC(Vector128, get_One, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector128, get_Zero, 16, 0, {INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi}, HW_Category_Helper, HW_Flag_NoCodeGen|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector128, GetElement, 16, 2, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(Vector128, GetLower, 16, 1, {INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov}, HW_Category_SIMD, HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(Vector128, GetUpper, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(Vector128, GetLower, 16, 1, {INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov}, HW_Category_SIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Vector128, GetUpper, 16, 1, {INS_ext, INS_ext, INS_ext, INS_ext, INS_ext, INS_ext, INS_ext, INS_ext, INS_ext, INS_ext}, HW_Category_SIMD, 
HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Vector128, GreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, GreaterThanAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, GreaterThanAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) @@ -216,6 +216,8 @@ HARDWARE_INTRINSIC(Vector128, ToScalar, HARDWARE_INTRINSIC(Vector128, WidenLower, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, WidenUpper, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, WithElement, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(Vector128, WithLower, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, WithUpper, 16, 2, {INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Xor, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index abe885e7cd5e4..6bba8a10224dd 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -176,7 +176,8 @@ HARDWARE_INTRINSIC(Vector256, get_AllBitsSet, HARDWARE_INTRINSIC(Vector256, get_One, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, get_Zero, 32, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, GetElement, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible) -HARDWARE_INTRINSIC(Vector256, GetLower, 32, 1, {INS_movdqu, INS_movdqu, 
INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_AvxOnlyCompatible) +HARDWARE_INTRINSIC(Vector256, GetLower, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics|HW_Flag_AvxOnlyCompatible) +HARDWARE_INTRINSIC(Vector256, GetUpper, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, GreaterThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(Vector256, GreaterThanAll, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, GreaterThanAny, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) @@ -231,6 +232,8 @@ HARDWARE_INTRINSIC(Vector256, ToVector512Unsafe, HARDWARE_INTRINSIC(Vector256, WidenLower, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, 
WidenUpper, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, WithElement, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible) +HARDWARE_INTRINSIC(Vector256, WithLower, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) +HARDWARE_INTRINSIC(Vector256, WithUpper, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, Xor, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -263,8 +266,9 @@ HARDWARE_INTRINSIC(Vector512, ExtractMostSignificantBits, HARDWARE_INTRINSIC(Vector512, get_AllBitsSet, 64, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(Vector512, get_One, 64, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, get_Zero, 64, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(Vector512, GetLower, 64, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(Vector512, GetLower128, 64, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Vector512, GetLower, 64, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Vector512, GetLower128, 64, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Vector512, GetUpper, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, 
HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, Load, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, LoadAligned, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, LoadAlignedNonTemporal, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) @@ -278,6 +282,8 @@ HARDWARE_INTRINSIC(Vector512, Store, HARDWARE_INTRINSIC(Vector512, StoreAligned, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, StoreAlignedNonTemporal, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, StoreUnsafe, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, WithLower, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, WithUpper, 64, 2, {INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, Xor, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -772,6 +778,7 @@ HARDWARE_INTRINSIC(AVX2, Xor, HARDWARE_INTRINSIC(AVX512F, And, 64, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_vpandq, INS_vpandq, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) +HARDWARE_INTRINSIC(AVX512F, ExtractVector256, 64, 2, {INS_vextracti64x4, INS_vextracti64x4, INS_vextracti64x4, INS_vextracti64x4, INS_vextracti64x4, INS_vextracti64x4, INS_vextracti64x4, INS_vextracti64x4, INS_vextractf64x4, INS_vextractf64x4}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512F, InsertVector256, 64, 3, {INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, 
INS_vinserti64x4, INS_vinserti64x4, INS_vinsertf64x4, INS_vinsertf64x4}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512F, LoadAlignedVector512, 64, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_vmovdqa64, INS_vmovdqa64, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX512F, LoadAlignedVector512NonTemporal, 64, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 7512e3f222f2a..d6ad9c910b5e2 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1496,8 +1496,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, simdType = TYP_SIMD16; - op1 = gtNewSimdHWIntrinsicNode(simdType, op1, NI_Vector256_GetLower, simdBaseJitType, - simdSize); + op1 = gtNewSimdGetLowerNode(simdType, op1, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ false); simdSize = 16; } @@ -2417,22 +2417,58 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_ToVector256: case NI_Vector128_ToVector256Unsafe: + { + assert(sig->numArgs == 1); + assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); + + op1 = impSIMDPopStack(getSIMDTypeForSize(simdSize)); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); + break; + } + case NI_Vector256_GetLower: { assert(sig->numArgs == 1); + assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); - if (compExactlyDependsOn(InstructionSet_AVX)) - { - op1 = impSIMDPopStack(getSIMDTypeForSize(simdSize)); - retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); - } + op1 = impSIMDPopStack(getSIMDTypeForSize(simdSize)); + retNode = gtNewSimdGetLowerNode(retType, op1, simdBaseJitType, simdSize, 
/* isSimdAsHWIntrinsic */ false); + break; + } + + case NI_Vector256_GetUpper: + { + assert(sig->numArgs == 1); + assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); + + op1 = impSIMDPopStack(getSIMDTypeForSize(simdSize)); + retNode = gtNewSimdGetUpperNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); + break; + } + + case NI_Vector512_GetLower: + { + assert(sig->numArgs == 1); + assert(IsBaselineVector512IsaSupportedDebugOnly()); + + op1 = impSIMDPopStack(getSIMDTypeForSize(simdSize)); + retNode = gtNewSimdGetLowerNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); + break; + } + + case NI_Vector512_GetUpper: + { + assert(sig->numArgs == 1); + assert(IsBaselineVector512IsaSupportedDebugOnly()); + + op1 = impSIMDPopStack(getSIMDTypeForSize(simdSize)); + retNode = gtNewSimdGetUpperNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); break; } case NI_Vector128_ToVector512: case NI_Vector256_ToVector512: case NI_Vector256_ToVector512Unsafe: - case NI_Vector512_GetLower: case NI_Vector512_GetLower128: { assert(sig->numArgs == 1); @@ -2536,6 +2572,54 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector256_WithLower: + { + assert(sig->numArgs == 2); + assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); + + op2 = impSIMDPopStack(TYP_SIMD16); + op1 = impSIMDPopStack(TYP_SIMD32); + retNode = gtNewSimdWithLowerNode(retType, op1, op2, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ false); + break; + } + + case NI_Vector256_WithUpper: + { + assert(sig->numArgs == 2); + assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); + + op2 = impSIMDPopStack(TYP_SIMD16); + op1 = impSIMDPopStack(TYP_SIMD32); + retNode = gtNewSimdWithUpperNode(retType, op1, op2, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ false); + break; + } + + case NI_Vector512_WithLower: + { + assert(sig->numArgs == 2); + 
assert(IsBaselineVector512IsaSupportedDebugOnly()); + + op2 = impSIMDPopStack(TYP_SIMD32); + op1 = impSIMDPopStack(TYP_SIMD64); + retNode = gtNewSimdWithLowerNode(retType, op1, op2, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ false); + break; + } + + case NI_Vector512_WithUpper: + { + assert(sig->numArgs == 2); + assert(IsBaselineVector512IsaSupportedDebugOnly()); + + op2 = impSIMDPopStack(TYP_SIMD32); + op1 = impSIMDPopStack(TYP_SIMD64); + retNode = gtNewSimdWithUpperNode(retType, op1, op2, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ false); + break; + } + case NI_Vector128_Xor: case NI_Vector256_Xor: case NI_Vector512_Xor: diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 150c76d49033e..3253471c94550 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -618,6 +618,8 @@ INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // AVX512F +INST3(vextractf64x4, "extractf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1B), INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Extract 256-bit packed double-precision floating point values +INST3(vextracti64x4, "extracti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3B), INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Extract 256-bit packed quadword integer values INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values INST3(vinserti64x4, "inserti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer 
values INST3(vmovdqa64, "movdqa64", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_FLAGS_None) @@ -635,6 +637,8 @@ INST3(vpmovb2m, "pmovb2m", IUM_WR, BAD_CODE, BAD_CODE, INST3(vpmovw2m, "pmovw2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x29), INS_TT_NONE, Input_16Bit | REX_W1_EVEX | Encoding_EVEX) // AVX512DQ +INST3(vextractf32x8, "extractf32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1B), INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Extract 256-bit packed single-precision floating point values +INST3(vextracti32x8, "extracti32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3B), INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Extract 256-bit packed doubleword integer values INST3(vinsertf32x8, "insertf32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values INST3(vinserti32x8, "inserti32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values INST3(vpmovd2m, "pmovd2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x39), INS_TT_NONE, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index a2727d63cf451..3f603fcf95437 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1110,6 +1110,28 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) return LowerHWIntrinsicCmpOp(node, GT_NE); } + case NI_Vector128_WithLower: + case NI_Vector128_WithUpper: + { + // Converts to equivalent managed code: + // AdvSimd.InsertScalar(vector.AsUInt64(), 0, value.AsUInt64()).As(); + // -or- + // 
AdvSimd.InsertScalar(vector.AsUInt64(), 1, value.AsUInt64()).As(); + + int index = (intrinsicId == NI_Vector128_WithUpper) ? 1 : 0; + + GenTree* op1 = node->Op(1); + GenTree* op2 = node->Op(2); + + GenTree* op3 = comp->gtNewIconNode(index); + BlockRange().InsertBefore(node, op3); + LowerNode(op3); + + node->SetSimdBaseJitType(CORINFO_TYPE_ULONG); + node->ResetHWIntrinsicId(NI_AdvSimd_InsertScalar, comp, op1, op3, op2); + break; + } + case NI_AdvSimd_FusedMultiplyAddScalar: LowerHWIntrinsicFusedMultiplyAddScalar(node); break; diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index f7b087b9db07b..8e323fa3e1f98 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1096,12 +1096,100 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) return node->gtNext; } + case NI_Vector256_GetUpper: + { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); + var_types simdBaseType = node->GetSimdBaseType(); + + if (varTypeIsFloating(simdBaseType) || !comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) + { + intrinsicId = NI_AVX_ExtractVector128; + } + else + { + intrinsicId = NI_AVX2_ExtractVector128; + } + + GenTree* op1 = node->Op(1); + + GenTree* op2 = comp->gtNewIconNode(1); + BlockRange().InsertBefore(node, op2); + LowerNode(op2); + + node->ResetHWIntrinsicId(intrinsicId, comp, op1, op2); + break; + } + + case NI_Vector512_GetUpper: + { + assert(comp->IsBaselineVector512IsaSupportedDebugOnly()); + var_types simdBaseType = node->GetSimdBaseType(); + + intrinsicId = NI_AVX512F_ExtractVector256; + + GenTree* op1 = node->Op(1); + + GenTree* op2 = comp->gtNewIconNode(1); + BlockRange().InsertBefore(node, op2); + LowerNode(op2); + + node->ResetHWIntrinsicId(intrinsicId, comp, op1, op2); + break; + } + case NI_Vector128_WithElement: case NI_Vector256_WithElement: { return LowerHWIntrinsicWithElement(node); } + case NI_Vector256_WithLower: + case NI_Vector256_WithUpper: + { + 
assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); + var_types simdBaseType = node->GetSimdBaseType(); + int index = (intrinsicId == NI_Vector256_WithUpper) ? 1 : 0; + + if (varTypeIsFloating(simdBaseType) || !comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) + { + intrinsicId = NI_AVX_InsertVector128; + } + else + { + intrinsicId = NI_AVX2_InsertVector128; + } + + GenTree* op1 = node->Op(1); + GenTree* op2 = node->Op(2); + + GenTree* op3 = comp->gtNewIconNode(index); + BlockRange().InsertBefore(node, op3); + LowerNode(op3); + + node->ResetHWIntrinsicId(intrinsicId, comp, op1, op2, op3); + break; + } + + case NI_Vector512_WithLower: + case NI_Vector512_WithUpper: + { + assert(comp->IsBaselineVector512IsaSupportedDebugOnly()); + var_types simdBaseType = node->GetSimdBaseType(); + int index = (intrinsicId == NI_Vector512_WithUpper) ? 1 : 0; + + intrinsicId = NI_AVX512F_InsertVector256; + + GenTree* op1 = node->Op(1); + GenTree* op2 = node->Op(2); + + GenTree* op3 = comp->gtNewIconNode(index); + BlockRange().InsertBefore(node, op3); + LowerNode(op3); + + node->ResetHWIntrinsicId(intrinsicId, comp, op1, op2, op3); + break; + } + case NI_Vector128_op_Equality: case NI_Vector256_op_Equality: { @@ -2220,14 +2308,13 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // idx = CNS_INT int 0 // /--* tmp3 simd32 // +--* tmp1 simd16 - // +--* idx int - // node = * HWINTRINSIC simd32 T InsertVector128 + // node = * HWINTRINSIC simd32 T WithUpper // This is roughly the following managed code: // var tmp1 = Vector128.Create(op1); // var tmp2 = tmp1; // var tmp3 = tmp2.ToVector256Unsafe(); - // return Avx.InsertVector128(tmp3, tmp1, 0x01); + // return tmp3.WithUpper(tmp1); tmp1 = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, op1, simdBaseJitType, 16, false); BlockRange().InsertAfter(op1, tmp1); @@ -2246,10 +2333,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, tmp2, 
NI_Vector128_ToVector256Unsafe, simdBaseJitType, 16); BlockRange().InsertAfter(tmp2, tmp3); - idx = comp->gtNewIconNode(0x01, TYP_INT); - BlockRange().InsertAfter(tmp3, idx); - - node->ResetHWIntrinsicId(NI_AVX_InsertVector128, comp, tmp3, tmp1, idx); + node->ResetHWIntrinsicId(NI_Vector256_WithUpper, comp, tmp3, tmp1); LowerNode(tmp3); return LowerNode(node); @@ -2574,17 +2658,15 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // /--* ... T // +--* opN T // hi = * HWINTRINSIC simd32 T Create - // idx = CNS_INT int 1 // /--* lo simd64 // +--* hi simd32 - // +--* idx int - // node = * HWINTRINSIC simd64 T InsertVector256 + // node = * HWINTRINSIC simd64 T WithUpper // This is roughly the following managed code: // ... // var lo = Vector256.Create(op1, ...); // var hi = Vector256.Create(..., opN); - // return Avx512F.InsertVector512(lo, hi, 0x01); + // return lo.WithUpper(hi); // Each Vector256.Create call gets half the operands. That is: // lo = Vector256.Create(op1, op2); @@ -2610,11 +2692,8 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) NI_Vector256_Create, simdBaseJitType, 32); BlockRange().InsertAfter(node->Op(argCnt), hi); - idx = comp->gtNewIconNode(0x01, TYP_INT); - BlockRange().InsertAfter(hi, idx); - assert(argCnt >= 7); - node->ResetHWIntrinsicId(NI_AVX512F_InsertVector256, comp, lo, hi, idx); + node->ResetHWIntrinsicId(NI_Vector512_WithUpper, comp, lo, hi); LowerNode(lo); LowerNode(hi); @@ -2632,17 +2711,15 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // /--* ... T // +--* opN T // hi = * HWINTRINSIC simd16 T Create - // idx = CNS_INT int 1 // /--* lo simd32 // +--* hi simd16 - // +--* idx int - // node = * HWINTRINSIC simd32 T InsertVector128 + // node = * HWINTRINSIC simd32 T WithUpper // This is roughly the following managed code: // ... 
// var lo = Vector128.Create(op1, ...); // var hi = Vector128.Create(..., opN); - // return Avx.InsertVector128(lo, hi, 0x01); + // return lo.WithUpper(hi); // Each Vector128.Create call gets half the operands. That is: // lo = Vector128.Create(op1, op2); @@ -2668,11 +2745,8 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) NI_Vector128_Create, simdBaseJitType, 16); BlockRange().InsertAfter(node->Op(argCnt), hi); - idx = comp->gtNewIconNode(0x01, TYP_INT); - BlockRange().InsertAfter(hi, idx); - assert(argCnt >= 3); - node->ResetHWIntrinsicId(NI_AVX_InsertVector128, comp, lo, hi, idx); + node->ResetHWIntrinsicId(NI_Vector256_WithUpper, comp, lo, hi); LowerNode(lo); LowerNode(hi); @@ -3202,23 +3276,17 @@ void Lowering::LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node) if (imm8 >= count / 2) { - // idx = CNS_INT int 1 - // /--* op1 simd32 - // +--* idx int - // op1 = * HWINTRINSIC simd32 T ExtractVector128 + // /--* op1 simd32 + // tmp1 = * HWINTRINSIC simd16 T GetUpper // This is roughly the following managed code: // ... - // op1 = Avx.ExtractVector128(op1, 0x01); + // tmp1 = op1.GetUpper(); imm8 -= count / 2; - idx = comp->gtNewIconNode(1); - BlockRange().InsertBefore(node, idx); - - tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, idx, NI_AVX_ExtractVector128, simdBaseJitType, - simdSize); - BlockRange().InsertAfter(idx, tmp1); + tmp1 = comp->gtNewSimdGetUpperNode(TYP_SIMD16, op1, simdBaseJitType, simdSize, false); + BlockRange().InsertBefore(node, tmp1); LowerNode(tmp1); } else @@ -3230,7 +3298,7 @@ void Lowering::LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node) // ... 
// op1 = op1.GetLower(); - tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector256_GetLower, simdBaseJitType, simdSize); + tmp1 = comp->gtNewSimdGetLowerNode(TYP_SIMD16, op1, simdBaseJitType, simdSize, false); BlockRange().InsertBefore(node, tmp1); LowerNode(tmp1); } @@ -3439,23 +3507,18 @@ GenTree* Lowering::LowerHWIntrinsicWithElement(GenTreeHWIntrinsic* node) { // We will be constructing the following parts: // ... - // idx = CNS_INT int 1 + // /--* op1 simd32 - // +--* idx int - // op1 = * HWINTRINSIC simd32 T ExtractVector128 + // tmp1 = * HWINTRINSIC simd16 T GetUpper // This is roughly the following managed code: // ... - // op1 = Avx.ExtractVector128(op1, 0x01); + // tmp1 = op1.GetUpper(); imm8 -= count / 2; - idx = comp->gtNewIconNode(1); - BlockRange().InsertAfter(op1, idx); - - tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, idx, NI_AVX_ExtractVector128, simdBaseJitType, - simdSize); - BlockRange().InsertAfter(idx, tmp1); + tmp1 = comp->gtNewSimdGetUpperNode(TYP_SIMD16, op1, simdBaseJitType, simdSize, false); + BlockRange().InsertAfter(op1, tmp1); LowerNode(tmp1); } else @@ -3469,7 +3532,7 @@ GenTree* Lowering::LowerHWIntrinsicWithElement(GenTreeHWIntrinsic* node) // ... // op1 = op1.GetLower(); - tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector256_GetLower, simdBaseJitType, simdSize); + tmp1 = comp->gtNewSimdGetLowerNode(TYP_SIMD16, op1, simdBaseJitType, simdSize, false); BlockRange().InsertAfter(op1, tmp1); LowerNode(tmp1); } @@ -3477,9 +3540,14 @@ GenTree* Lowering::LowerHWIntrinsicWithElement(GenTreeHWIntrinsic* node) op1 = tmp1; // Now we will insert our "result" into our simd32 temporary. - idx = comp->gtNewIconNode((cachedImm8 >= count / 2) ? 
1 : 0); - BlockRange().InsertBefore(node, idx); - node->ChangeHWIntrinsicId(NI_AVX_InsertVector128, tmp32, result, idx); + if (cachedImm8 >= count / 2) + { + node->ResetHWIntrinsicId(NI_Vector256_WithUpper, comp, tmp32, result); + } + else + { + node->ResetHWIntrinsicId(NI_Vector256_WithLower, comp, tmp32, result); + } } switch (simdBaseType) @@ -3677,7 +3745,8 @@ GenTree* Lowering::LowerHWIntrinsicWithElement(GenTreeHWIntrinsic* node) { // Now that we have finalized the shape of the tree, lower the insertion node as well. - assert(node->GetHWIntrinsicId() == NI_AVX_InsertVector128); + assert((node->GetHWIntrinsicId() == NI_Vector256_WithLower) || + (node->GetHWIntrinsicId() == NI_Vector256_WithUpper)); assert(node != result); nextNode = LowerNode(node); @@ -3756,10 +3825,8 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) // /--* tmp1 simd32 // tmp1 = * HWINTRINSIC simd16 T GetLower // tmp2 = LCL_VAR simd32 - // idx = CNS_INT int 0x01 // /--* tmp2 simd16 - // +--* idx int - // tmp2 = * HWINTRINSIC simd16 T ExtractVector128 + // tmp2 = * HWINTRINSIC simd16 T GetUpper // /--* tmp1 simd16 // +--* tmp2 simd16 // tmp3 = * HWINTRINSIC simd16 T Add @@ -3768,7 +3835,7 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) // This is roughly the following managed code: // var tmp1 = Avx.DotProduct(op1, op2, 0xFF); - // var tmp2 = Avx.ExtractVector128(tmp1, 0x01); + // var tmp2 = tmp1.GetUpper(); // var tmp3 = Sse.Add(tmp1, tmp2); // return tmp3.ToScalar(); @@ -3788,26 +3855,21 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) tmp2 = comp->gtClone(tmp1); BlockRange().InsertAfter(tmp1, tmp2); - idx = comp->gtNewIconNode(0x01, TYP_INT); - BlockRange().InsertAfter(tmp2, idx); - - tmp2 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp2, idx, NI_AVX_ExtractVector128, simdBaseJitType, - simdSize); - BlockRange().InsertAfter(idx, tmp2); - LowerNode(tmp2); + tmp3 = comp->gtNewSimdGetUpperNode(TYP_SIMD16, tmp2, simdBaseJitType, 
simdSize, false); + BlockRange().InsertAfter(tmp2, tmp3); + LowerNode(tmp3); - tmp1 = - comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, NI_Vector256_GetLower, simdBaseJitType, simdSize); - BlockRange().InsertAfter(tmp2, tmp1); + tmp1 = comp->gtNewSimdGetLowerNode(TYP_SIMD16, tmp1, simdBaseJitType, simdSize, false); + BlockRange().InsertAfter(tmp3, tmp1); LowerNode(tmp1); - tmp3 = comp->gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, tmp2, tmp1, simdBaseJitType, 16, false); - BlockRange().InsertAfter(tmp1, tmp3); - LowerNode(tmp3); + tmp2 = comp->gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, tmp3, tmp1, simdBaseJitType, 16, false); + BlockRange().InsertAfter(tmp1, tmp2); + LowerNode(tmp2); node->SetSimdSize(16); - node->ResetHWIntrinsicId(NI_Vector128_ToScalar, tmp3); + node->ResetHWIntrinsicId(NI_Vector128_ToScalar, tmp2); return LowerNode(node); } @@ -4291,19 +4353,17 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) // /--* tmp1 simd32 // tmp1 = * HWINTRINSIC simd16 T GetLower // tmp2 = LCL_VAR simd32 - // idx = CNS_INT int 0x01 // /--* tmp2 simd32 - // +--* idx int - // tmp2 = * HWINTRINSIC simd16 T ExtractVector128 + // tmp3 = * HWINTRINSIC simd16 T GetUpper // /--* tmp1 simd16 - // +--* tmp2 simd16 + // +--* tmp3 simd16 // tmp1 = * HWINTRINSIC simd16 T Add // ... // This is roughly the following managed code: // ... // var tmp2 = tmp1; - // tmp2 = Avx.ExtractVector128(tmp2, 0x01); + // tmp3 = tmp2.GetUpper(); // var tmp1 = Isa.Add(tmp1.GetLower(), tmp2); // ... 
@@ -4317,24 +4377,20 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) tmp2 = comp->gtClone(tmp1); BlockRange().InsertAfter(tmp1, tmp2); - idx = comp->gtNewIconNode(0x01, TYP_INT); - BlockRange().InsertAfter(tmp2, idx); - - tmp2 = - comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp2, idx, NI_AVX_ExtractVector128, simdBaseJitType, simdSize); - BlockRange().InsertAfter(idx, tmp2); - LowerNode(tmp2); + tmp3 = comp->gtNewSimdGetUpperNode(TYP_SIMD16, tmp2, simdBaseJitType, simdSize, false); + BlockRange().InsertAfter(tmp2, tmp3); + LowerNode(tmp3); - tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, NI_Vector256_GetLower, simdBaseJitType, simdSize); - BlockRange().InsertAfter(tmp2, tmp1); + tmp1 = comp->gtNewSimdGetLowerNode(TYP_SIMD16, tmp1, simdBaseJitType, simdSize, false); + BlockRange().InsertAfter(tmp3, tmp1); LowerNode(tmp1); - tmp3 = comp->gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, tmp2, tmp1, simdBaseJitType, 16, false); - BlockRange().InsertAfter(tmp1, tmp3); - LowerNode(tmp3); + tmp2 = comp->gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, tmp3, tmp1, simdBaseJitType, 16, false); + BlockRange().InsertAfter(tmp1, tmp2); + LowerNode(tmp2); node->SetSimdSize(16); - tmp1 = tmp3; + tmp1 = tmp2; } if (varTypeIsSIMD(node->gtType)) @@ -5503,6 +5559,7 @@ void Lowering::ContainCheckStoreIndir(GenTreeStoreInd* node) case NI_SSE41_X64_Extract: case NI_AVX_ExtractVector128: case NI_AVX2_ExtractVector128: + case NI_AVX512F_ExtractVector256: { // These intrinsics are "ins reg/mem, xmm, imm8" @@ -6599,6 +6656,20 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre break; } + case NI_AVX512F_InsertVector256: + { + // InsertVector256 is special in that it returns a TYP_SIMD64 but takes a TYP_SIMD32. 
+ assert(!supportsSIMDScalarLoads); + + const unsigned expectedSize = 32; + const unsigned operandSize = genTypeSize(childNode->TypeGet()); + + supportsAlignedSIMDLoads = !comp->canUseEvexEncoding() || !comp->opts.MinOpts(); + supportsUnalignedSIMDLoads = comp->canUseEvexEncoding(); + supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize); + break; + } + case NI_SSE2_Insert: case NI_SSE41_Insert: case NI_SSE41_X64_Insert: @@ -6885,6 +6956,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre case NI_Vector128_GetElement: case NI_AVX_ExtractVector128: case NI_AVX2_ExtractVector128: + case NI_AVX512F_ExtractVector256: { // These are only containable as part of a store return false; @@ -7176,6 +7248,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) case NI_SSE2_Extract: case NI_AVX_ExtractVector128: case NI_AVX2_ExtractVector128: + case NI_AVX512F_ExtractVector256: { // These intrinsics are "ins reg/mem, xmm, imm8" and get // contained by the relevant store operation instead. diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs index a8853d950e1cd..7ca39c664c5ff 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs @@ -3144,22 +3144,16 @@ public static Vector128 WithElement(this Vector128 vector, int index, T /// The value of the lower 64-bits as a . /// A new with the lower 64-bits set to and the upper 64-bits set to the same value as that in . /// The type of () is not supported. 
+ [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 WithLower(this Vector128 vector, Vector64 value) where T : struct { ThrowHelper.ThrowForUnsupportedIntrinsicsVector128BaseType(); - if (AdvSimd.IsSupported) - { - return AdvSimd.InsertScalar(vector.AsUInt64(), 0, value.AsUInt64()).As(); - } - else - { - Vector128 result = vector; - result.SetLowerUnsafe(value); - return result; - } + Vector128 result = vector; + result.SetLowerUnsafe(value); + return result; } /// Creates a new with the upper 64-bits set to the specified value and the upper 64-bits set to the same value as that in the given vector. @@ -3168,22 +3162,16 @@ public static Vector128 WithLower(this Vector128 vector, Vector64 va /// The value of the upper 64-bits as a . /// A new with the upper 64-bits set to and the lower 64-bits set to the same value as that in . /// The type of () is not supported. + [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 WithUpper(this Vector128 vector, Vector64 value) where T : struct { ThrowHelper.ThrowForUnsupportedIntrinsicsVector128BaseType(); - if (AdvSimd.IsSupported) - { - return AdvSimd.InsertScalar(vector.AsUInt64(), 1, value.AsUInt64()).As(); - } - else - { - Vector128 result = vector; - result.SetUpperUnsafe(value); - return result; - } + Vector128 result = vector; + result.SetUpperUnsafe(value); + return result; } /// Computes the exclusive-or of two vectors. diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs index 340d20b081232..ec5555516de4f 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs @@ -1522,27 +1522,13 @@ public static Vector128 GetLower(this Vector256 vector) /// The vector to get the upper 128-bits from. 
/// The value of the upper 128-bits as a new . /// The type of () is not supported. + [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 GetUpper(this Vector256 vector) where T : struct { ThrowHelper.ThrowForUnsupportedIntrinsicsVector256BaseType(); - - if (Avx2.IsSupported && ((typeof(T) != typeof(float)) && (typeof(T) != typeof(double)))) - { - // All integral types generate the same instruction, so just pick one rather than handling each T separately - return Avx2.ExtractVector128(vector.AsByte(), 1).As(); - } - else if (Avx.IsSupported) - { - // All floating-point types generate the same instruction, so just pick one rather than handling each T separately - // We also just fallback to this for integral types if AVX2 isn't supported, since that is still faster than software - return Avx.ExtractVector128(vector.AsSingle(), 1).As(); - } - else - { - return vector._upper; - } + return vector._upper; } /// Compares two vectors to determine which is greater on a per-element basis. @@ -3105,29 +3091,16 @@ public static Vector256 WithElement(this Vector256 vector, int index, T /// The value of the lower 128-bits as a . /// A new with the lower 128-bits set to and the upper 128-bits set to the same value as that in . /// The type of () is not supported. 
+ [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector256 WithLower(this Vector256 vector, Vector128 value) where T : struct { ThrowHelper.ThrowForUnsupportedIntrinsicsVector256BaseType(); - if (Avx2.IsSupported && ((typeof(T) != typeof(float)) && (typeof(T) != typeof(double)))) - { - // All integral types generate the same instruction, so just pick one rather than handling each T separately - return Avx2.InsertVector128(vector.AsByte(), value.AsByte(), 0).As(); - } - else if (Avx.IsSupported) - { - // All floating-point types generate the same instruction, so just pick one rather than handling each T separately - // We also just fallback to this for integral types if AVX2 isn't supported, since that is still faster than software - return Avx.InsertVector128(vector.AsSingle(), value.AsSingle(), 0).As(); - } - else - { - Vector256 result = vector; - result.SetLowerUnsafe(value); - return result; - } + Vector256 result = vector; + result.SetLowerUnsafe(value); + return result; } /// Creates a new with the upper 128-bits set to the specified value and the upper 128-bits set to the same value as that in the given vector. @@ -3136,29 +3109,16 @@ public static Vector256 WithLower(this Vector256 vector, Vector128 v /// The value of the upper 128-bits as a . /// A new with the upper 128-bits set to and the lower 128-bits set to the same value as that in . /// The type of () is not supported. 
+ [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector256 WithUpper(this Vector256 vector, Vector128 value) where T : struct { ThrowHelper.ThrowForUnsupportedIntrinsicsVector256BaseType(); - if (Avx2.IsSupported && ((typeof(T) != typeof(float)) && (typeof(T) != typeof(double)))) - { - // All integral types generate the same instruction, so just pick one rather than handling each T separately - return Avx2.InsertVector128(vector.AsByte(), value.AsByte(), 1).As(); - } - else if (Avx.IsSupported) - { - // All floating-point types generate the same instruction, so just pick one rather than handling each T separately - // We also just fallback to this for integral types if AVX2 isn't supported, since that is still faster than software - return Avx.InsertVector128(vector.AsSingle(), value.AsSingle(), 1).As(); - } - else - { - Vector256 result = vector; - result.SetUpperUnsafe(value); - return result; - } + Vector256 result = vector; + result.SetUpperUnsafe(value); + return result; } /// Computes the exclusive-or of two vectors. diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs index 9b7be1d777fb6..26615fe12df2d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs @@ -1575,6 +1575,7 @@ public static Vector256 GetLower(this Vector512 vector) /// The vector to get the upper 256-bits from. /// The value of the upper 256-bits as a new . /// The type of () is not supported. + [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector256 GetUpper(this Vector512 vector) where T : struct @@ -3093,6 +3094,7 @@ public static Vector512 WithElement(this Vector512 vector, int index, T /// The value of the lower 256-bits as a . 
/// A new with the lower 256-bits set to and the upper 256-bits set to the same value as that in . /// The type of () is not supported. + [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector512 WithLower(this Vector512 vector, Vector256 value) where T : struct @@ -3110,6 +3112,7 @@ public static Vector512 WithLower(this Vector512 vector, Vector256 v /// The value of the upper 256-bits as a . /// A new with the upper 256-bits set to and the lower 256-bits set to the same value as that in . /// The type of () is not supported. + [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector512 WithUpper(this Vector512 vector, Vector256 value) where T : struct