diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index 94d67a06ace08..070ce82e6ec22 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -780,6 +780,9 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 #endif
     void genSetRegToConst(regNumber targetReg, var_types targetType, GenTree* tree);
+#if defined(FEATURE_SIMD)
+    void genSetRegToConst(regNumber targetReg, var_types targetType, simd_t* val);
+#endif
     void genCodeForTreeNode(GenTree* treeNode);
     void genCodeForBinary(GenTreeOp* treeNode);
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index c0b2404cefb58..1988aa632cf35 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -417,6 +417,123 @@ void CodeGen::instGen_Set_Reg_To_Imm(emitAttr size,
     regSet.verifyRegUsed(reg);
 }
 
+#if defined(FEATURE_SIMD)
+//----------------------------------------------------------------------------------
+// genSetRegToConst: generate code to set target SIMD register to a given constant value
+//
+// Arguments:
+//     targetReg  - target SIMD register
+//     targetType - target's type
+//     val        - constant data (its width depends on type)
+//
+void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t* val)
+{
+    emitter* emit = GetEmitter();
+    emitAttr attr = emitTypeSize(targetType);
+
+    switch (targetType)
+    {
+        case TYP_SIMD8:
+        {
+            simd8_t val8 = *(simd8_t*)val;
+            if (val8.IsAllBitsSet())
+            {
+                emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg);
+            }
+            else if (val8.IsZero())
+            {
+                emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg);
+            }
+            else
+            {
+                CORINFO_FIELD_HANDLE hnd = emit->emitSimd8Const(val8);
+                emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
+            }
+            break;
+        }
+        case TYP_SIMD12:
+        {
+            simd12_t val12 = *(simd12_t*)val;
+            if (val12.IsAllBitsSet())
+            {
+                emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg);
+            }
+            else if (val12.IsZero())
+            {
+                emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg);
+            }
+            else
+            {
+                simd16_t val16 = {};
+                memcpy(&val16, &val12, sizeof(val12));
+                CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(val16);
+                emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
+            }
+            break;
+        }
+        case TYP_SIMD16:
+        {
+            simd16_t val16 = *(simd16_t*)val;
+            if (val16.IsAllBitsSet())
+            {
+                emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg);
+            }
+            else if (val16.IsZero())
+            {
+                emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
+            }
+            else
+            {
+                CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(val16);
+                emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
+            }
+            break;
+        }
+        case TYP_SIMD32:
+        {
+            simd32_t val32 = *(simd32_t*)val;
+            if (val32.IsAllBitsSet() && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2))
+            {
+                emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg);
+            }
+            else if (val32.IsZero())
+            {
+                emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
+            }
+            else
+            {
+                CORINFO_FIELD_HANDLE hnd = emit->emitSimd32Const(val32);
+                emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
+            }
+            break;
+        }
+        case TYP_SIMD64:
+        {
+            simd64_t val64 = *(simd64_t*)val;
+            if (val64.IsAllBitsSet() && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F))
+            {
+                emit->emitIns_SIMD_R_R_R_I(INS_vpternlogd, attr, targetReg, targetReg, targetReg,
+                                           static_cast<int8_t>(0xFF));
+            }
+            else if (val64.IsZero())
+            {
+                emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
+            }
+            else
+            {
+                CORINFO_FIELD_HANDLE hnd = emit->emitSimd64Const(val64);
+                emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
+            }
+            break;
+        }
+        default:
+        {
+            unreached();
+        }
+    }
+}
+#endif // FEATURE_SIMD
+
 /***********************************************************************************
  *
  * Generate code to set a register 'targetReg' of type 'targetType' to the constant
@@ -484,147 +601,12 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre
 
         case GT_CNS_VEC:
         {
-            GenTreeVecCon* vecCon = tree->AsVecCon();
-
-            emitter* emit = GetEmitter();
-            emitAttr attr = emitTypeSize(targetType);
-
-            if (vecCon->IsAllBitsSet())
-            {
-                switch (attr)
-                {
-                    case EA_8BYTE:
-                    case EA_16BYTE:
-                    {
-                        emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg);
-                        return;
-                    }
-
-#if defined(FEATURE_SIMD)
-                    case EA_32BYTE:
-                    {
-                        if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2))
-                        {
-                            emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg);
-                            return;
-                        }
-                        break;
-                    }
-
-                    case EA_64BYTE:
-                    {
-                        assert(compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F));
-                        emit->emitIns_SIMD_R_R_R_I(INS_vpternlogd, attr, targetReg, targetReg, targetReg,
-                                                   static_cast<int8_t>(0xFF));
-                        return;
-                    }
-#endif // FEATURE_SIMD
-
-                    default:
-                    {
-                        unreached();
-                    }
-                }
-            }
-
-            if (vecCon->IsZero())
-            {
-                switch (attr)
-                {
-                    case EA_8BYTE:
-                    case EA_16BYTE:
-                    {
-                        emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg);
-                        return;
-                    }
-
-                    case EA_32BYTE:
-                    {
-                        if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX))
-                        {
-                            emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
-                            return;
-                        }
-                        break;
-                    }
-
-                    case EA_64BYTE:
-                    {
-                        if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F))
-                        {
-                            emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
-                            return;
-                        }
-                        break;
-                    }
-
-                    default:
-                    {
-                        unreached();
-                    }
-                }
-            }
-
-            switch (tree->TypeGet())
-            {
-#if defined(FEATURE_SIMD)
-                case TYP_SIMD8:
-                {
-                    simd8_t constValue;
-                    memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd8_t));
-
-                    CORINFO_FIELD_HANDLE hnd = emit->emitSimd8Const(constValue);
-                    emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
-                    break;
-                }
-
-                case TYP_SIMD12:
-                {
-                    simd16_t constValue = {};
-                    memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd12_t));
-
-                    CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(constValue);
-                    emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
-                    break;
-                }
-
-                case TYP_SIMD16:
-                {
-                    simd16_t constValue;
-                    memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd16_t));
-
-                    CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(constValue);
-                    emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
-                    break;
-                }
-
-                case TYP_SIMD32:
-                {
-                    simd32_t constValue;
-                    memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd32_t));
-
-                    CORINFO_FIELD_HANDLE hnd = emit->emitSimd32Const(constValue);
-                    emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
-                    break;
-                }
-
-                case TYP_SIMD64:
-                {
-                    simd64_t constValue;
-                    memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd64_t));
-
-                    CORINFO_FIELD_HANDLE hnd = emit->emitSimd64Const(constValue);
-                    emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
-                    break;
-                }
-#endif // FEATURE_SIMD
-
-                default:
-                {
-                    unreached();
-                }
-            }
-
+#if defined(FEATURE_SIMD)
+            GenTreeVecCon* vecCon = tree->AsVecCon();
+            genSetRegToConst(vecCon->GetRegNum(), targetType, &vecCon->gtSimdVal);
+#else
+            unreached();
+#endif
             break;
         }
 
@@ -3134,17 +3116,8 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
     // INITBLK zeroes a struct that contains GC pointers and can be observed by
     // other threads (i.e. when dstAddr is not an address of a local).
    // For example, this can happen when initializing a struct field of an object.
-    const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences();
-
-#ifdef TARGET_AMD64
-    // On Amd64 the JIT will not use SIMD stores for such structs and instead
-    // will always allocate a GP register for src node.
-    const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);
-#else
-    // On X86 the JIT will use movq for structs that are larger than 16 bytes
-    // since it is more beneficial than using two mov-s from a GP register.
-    const bool willUseSimdMov = (size >= 16);
-#endif
+    const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences() && compiler->IsBaselineSimdIsaSupported();
+    const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);
 
     if (!src->isContained())
     {
@@ -3152,14 +3125,8 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
     }
     else
    {
-        // If src is contained then it must be 0.
-        assert(src->IsIntegralConst(0));
         assert(willUseSimdMov);
-#ifdef TARGET_AMD64
         assert(size >= XMM_REGSIZE_BYTES);
-#else
-        assert(size % 8 == 0);
-#endif
     }
 
     emitter* emit = GetEmitter();
@@ -3167,57 +3134,20 @@
     assert(size <= INT32_MAX);
     assert(dstOffset < (INT32_MAX - static_cast<int>(size)));
 
+#ifdef FEATURE_SIMD
     if (willUseSimdMov)
     {
         regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT);
-
-        unsigned regSize = compiler->roundDownSIMDSize(size);
+        unsigned regSize = compiler->roundDownSIMDSize(size);
         if (size < ZMM_RECOMMENDED_THRESHOLD)
         {
             // Involve ZMM only for large data due to possible downclocking.
             regSize = min(regSize, YMM_REGSIZE_BYTES);
         }
-
-        bool zeroing = false;
-        if (src->gtSkipReloadOrCopy()->IsIntegralConst(0))
-        {
-            // If the source is constant 0 then always use xorps, it's faster
-            // than copying the constant from a GPR to a XMM register.
-            emit->emitIns_SIMD_R_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg, srcXmmReg);
-            zeroing = true;
-        }
-        else
-        {
-            // TODO-AVX512-ARCH: Enable AVX-512 for non-zeroing initblk.
-            regSize = min(regSize, YMM_REGSIZE_BYTES);
-
-            if (compiler->compOpportunisticallyDependsOn(InstructionSet_Vector512))
-            {
-                emit->emitIns_R_R(INS_vpbroadcastd_gpr, EA_ATTR(regSize), srcXmmReg, srcIntReg);
-            }
-            else if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2))
-            {
-                emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false);
-                emit->emitIns_R_R(INS_vpbroadcastd, EA_ATTR(regSize), srcXmmReg, srcXmmReg);
-            }
-            else
-            {
-                emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false);
-
-                emit->emitIns_SIMD_R_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg, srcXmmReg);
-
-#ifdef TARGET_X86
-                // For x86, we need one more to convert it from 8 bytes to 16 bytes.
-                emit->emitIns_SIMD_R_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg, srcXmmReg);
-#endif
-
-                if (regSize == YMM_REGSIZE_BYTES)
-                {
-                    // Extend the bytes in the lower lanes to the upper lanes
-                    emit->emitIns_R_R_R_I(INS_vinsertf128, EA_32BYTE, srcXmmReg, srcXmmReg, srcXmmReg, 1);
-                }
-            }
-        }
+        var_types loadType = compiler->getSIMDTypeForSize(regSize);
+        simd_t vecCon;
+        memset(&vecCon, (uint8_t)src->AsIntCon()->IconValue(), sizeof(simd_t));
+        genSetRegToConst(srcXmmReg, loadType, &vecCon);
 
         instruction simdMov = simdUnalignedMovIns();
         unsigned bytesWritten = 0;
@@ -3236,55 +3166,32 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 
         while (bytesWritten < size)
         {
-#ifdef TARGET_X86
-            if (!canUse16BytesSimdMov || (bytesWritten + regSize > size))
-            {
-                simdMov = INS_movq;
-                regSize = 8;
-            }
-#endif
             if (bytesWritten + regSize > size)
             {
+                // We have a remainder that is smaller than regSize.
                 break;
            }
 
             emitSimdMovs();
             dstOffset += regSize;
             bytesWritten += regSize;
-
-            if (!zeroing)
-            {
-                assert(regSize <= YMM_REGSIZE_BYTES);
-            }
-
-            if (!zeroing && regSize == YMM_REGSIZE_BYTES && size - bytesWritten < YMM_REGSIZE_BYTES)
-            {
-                regSize = XMM_REGSIZE_BYTES;
-            }
         }
 
         size -= bytesWritten;
 
-        // Handle the remainder by overlapping with previously processed data (only for zeroing)
-        if (zeroing && (size > 0) && (size < regSize) && (regSize >= XMM_REGSIZE_BYTES))
+        // Handle the remainder by overlapping with previously processed data
+        if ((size > 0) && (size < regSize) && (regSize >= XMM_REGSIZE_BYTES))
         {
-            if (isPow2(size) && (size <= REGSIZE_BYTES))
-            {
-                // For sizes like 1,2,4 and 8 we delegate handling to normal stores
-                // because that will be a single instruction that is smaller than SIMD mov
-            }
-            else
-            {
-                // Get optimal register size to cover the whole remainder (with overlapping)
-                regSize = compiler->roundUpSIMDSize(size);
+            // Get optimal register size to cover the whole remainder (with overlapping)
+            regSize = compiler->roundUpSIMDSize(size);
 
-                // Rewind dstOffset so we can fit a vector for the while remainder
-                dstOffset -= (regSize - size);
-                emitSimdMovs();
-                size = 0;
-            }
+            // Rewind dstOffset so we can fit a vector for the whole remainder
+            dstOffset -= (regSize - size);
+            emitSimdMovs();
+            size = 0;
         }
     }
+#endif // FEATURE_SIMD
 
     assert((srcIntReg != REG_NA) || (size == 0));
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index ca3c64dd2ff24..d780773abba69 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -327,7 +327,6 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
     }
     else
     {
-
         // The fill value of an initblk is interpreted to hold a
         // value of (unsigned int8) however a constant of any size
         // may practically reside on the evaluation stack. So extract
@@ -337,38 +336,22 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
             ssize_t fill = src->AsIntCon()->IconValue() & 0xFF;
 
-            if (fill == 0)
+            const bool canUseSimd = !blkNode->IsOnHeapAndContainsReferences() && comp->IsBaselineSimdIsaSupported();
+            if (size > comp->getUnrollThreshold(Compiler::UnrollKind::Memset, canUseSimd))
             {
-                if (size >= XMM_REGSIZE_BYTES)
-                {
-                    const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences();
-#ifdef TARGET_AMD64
-
-                    bool willUseOnlySimdMov = size % XMM_REGSIZE_BYTES == 0;
-                    if (!willUseOnlySimdMov)
-                    {
-                        // If we have a remainder we still might only use SIMD to process it (via overlapping)
-                        // unless it's more efficient to do that via scalar op (for sizes 1,2,4 and 8)
-                        const unsigned remainder = size % XMM_REGSIZE_BYTES;
-                        if (!isPow2(remainder) || (remainder > REGSIZE_BYTES))
-                        {
-                            willUseOnlySimdMov = true;
-                        }
-                    }
-#else
-                    const bool willUseOnlySimdMov = (size % 8 == 0);
-#endif
-                    if (willUseOnlySimdMov && canUse16BytesSimdMov)
-                    {
-                        src->SetContained();
-                    }
-                    else if (size > comp->getUnrollThreshold(Compiler::UnrollKind::Memset,
-                                                             /*canUseSimd*/ canUse16BytesSimdMov))
-                    {
-                        // It turns out we can't use SIMD so the default threshold is too big
-                        goto TOO_BIG_TO_UNROLL;
-                    }
-                }
+                // It turns out we can't use SIMD so the default threshold is too big
+                goto TOO_BIG_TO_UNROLL;
+            }
+            if (canUseSimd && (size >= XMM_REGSIZE_BYTES))
+            {
+                // We're going to use SIMD (and only SIMD - we don't want to occupy a GPR register
+                // with a fill value just to handle the remainder when we can do that with
+                // an overlapped SIMD load).
+                src->SetContained();
+            }
+            else if (fill == 0)
+            {
+                // Leave as is - zero shouldn't be contained when we don't use SIMD.
+            }
 #ifdef TARGET_AMD64
             else if (size >= REGSIZE_BYTES)