diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 91daacdb3837c..1ee65bec99cbc 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6235,8 +6235,9 @@ class Compiler void ConsiderEdge(FlowEdge* edge); void AddNonFallthroughSuccs(unsigned blockPos); void AddNonFallthroughPreds(unsigned blockPos); - bool RunGreedyThreeOptPass(unsigned startPos, unsigned endPos); + bool RunGreedyThreeOptPass(unsigned startPos, unsigned endPos); + bool RunGlobalThreeOptPass(unsigned startPos, unsigned endPos); bool RunThreeOptPass(BasicBlock* startBlock, BasicBlock* endBlock); public: diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 978555e03684b..899fcf3ebd947 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5288,6 +5288,8 @@ void Compiler::ThreeOptLayout::Run() } } } + + INDEBUG(compiler->Metrics.BasicBlockLayoutCost = GetLayoutCost(0, numCandidateBlocks - 1)); } //----------------------------------------------------------------------------- @@ -5437,6 +5439,93 @@ bool Compiler::ThreeOptLayout::RunGreedyThreeOptPass(unsigned startPos, unsigned return modified; } +//----------------------------------------------------------------------------- +// Compiler::ThreeOptLayout::RunGlobalThreeOptPass: Runs 3-opt for the given block range, +// trying every possible cut point until convergence. +// +// Parameters: +// startPos - The first position of the range to reorder +// endPos - The last position (inclusive) of the range to reorder +// +// Returns: +// True if we reordered anything, false otherwise +// +// Notes: +// The search for cut points is cubic in the number of blocks in the region being reordered. +// This search is repeated until the cost model converges. +// This approach is impractically expensive unless the region is small, or already close to an optimal layout. 
+// +bool Compiler::ThreeOptLayout::RunGlobalThreeOptPass(unsigned startPos, unsigned endPos) +{ + assert(startPos < endPos); + bool modified = false, foundPartition; + + auto isValidCutPoint = [this](BasicBlock* block) -> bool { + // Don't split up call-finally pairs. + // Also, don't bother reordering handler blocks. + // Finally, don't reorder nested try regions -- we order each region individually. + return !block->isBBCallFinallyPairTail() && !block->hasHndIndex() && (block->bbTryIndex == currEHRegion); + }; + + JITDUMP("Using global strategy for finding cut points.\n"); + + do + { + foundPartition = false; + for (unsigned s2Start = startPos + 1; !foundPartition && (s2Start < endPos); s2Start++) + { + BasicBlock* const s2Block = blockOrder[s2Start]; + if (!isValidCutPoint(s2Block)) + { + continue; + } + + for (unsigned s3Start = s2Start + 1; !foundPartition && (s3Start <= endPos); s3Start++) + { + BasicBlock* const s3Block = blockOrder[s3Start]; + if (!isValidCutPoint(s3Block)) + { + continue; + } + + for (unsigned s3End = s3Start; s3End < endPos; s3End++) + { + BasicBlock* const s4Block = blockOrder[s3End + 1]; + if (!isValidCutPoint(s4Block)) + { + continue; + } + + if (TrySwappingPartitions(startPos, s2Start, s3Start, s3End, endPos)) + { + foundPartition = true; + modified = true; + break; + } + } + + if (!foundPartition && TrySwappingPartitions(startPos, s2Start, s3Start, endPos, endPos)) + { + foundPartition = true; + modified = true; + } + } + } + // Keep searching for cut points until the cost model converges + } while (foundPartition); + + // Update ordinals, but only if we reordered anything, and if we will do another pass + if (modified && (currEHRegion != 0)) + { + for (unsigned i = startPos; i <= endPos; i++) + { + ordinals[blockOrder[i]->bbNum] = i; + } + } + + return modified; +} + //----------------------------------------------------------------------------- // Compiler::ThreeOptLayout::RunThreeOptPass: Runs 3-opt for the given block range. 
// @@ -5465,7 +5554,8 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc } JITDUMP("Initial layout cost: %f\n", GetLayoutCost(startPos, endPos)); - const bool modified = RunGreedyThreeOptPass(startPos, endPos); + const bool modified = JitConfig.JitDoGlobalThreeOpt() ? RunGlobalThreeOptPass(startPos, endPos) + : RunGreedyThreeOptPass(startPos, endPos); // Write back to 'tempOrder' so changes to this region aren't lost next time we swap 'tempOrder' and 'blockOrder' if (modified) diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 36162934bdf1b..eda4517c0386c 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -781,9 +781,12 @@ RELEASE_CONFIG_INTEGER(JitEnablePhysicalPromotion, "JitEnablePhysicalPromotion", // Enable cross-block local assertion prop RELEASE_CONFIG_INTEGER(JitEnableCrossBlockLocalAssertionProp, "JitEnableCrossBlockLocalAssertionProp", 1) -// Do greedy RPO-based layout in Compiler::fgReorderBlocks. +// Do greedy RPO-based block layout. RELEASE_CONFIG_INTEGER(JitDoReversePostOrderLayout, "JitDoReversePostOrderLayout", 1); +// Globally search for cut points in 3-opt layout instead of using the greedy strategy. 
+RELEASE_CONFIG_INTEGER(JitDoGlobalThreeOpt, "JitDoGlobalThreeOpt", 1); + // Enable strength reduction RELEASE_CONFIG_INTEGER(JitEnableStrengthReduction, "JitEnableStrengthReduction", 1) diff --git a/src/coreclr/jit/jitmetadatalist.h b/src/coreclr/jit/jitmetadatalist.h index 4642ca14d7b7c..3908c16e33456 100644 --- a/src/coreclr/jit/jitmetadatalist.h +++ b/src/coreclr/jit/jitmetadatalist.h @@ -50,6 +50,7 @@ JITMETADATAMETRIC(RedundantBranchesEliminated, int, JIT_M JITMETADATAMETRIC(JumpThreadingsPerformed, int, JIT_METADATA_HIGHER_IS_BETTER) JITMETADATAMETRIC(CseCount, int, 0) JITMETADATAMETRIC(BasicBlocksAtCodegen, int, 0) +JITMETADATAMETRIC(BasicBlockLayoutCost, double, JIT_METADATA_LOWER_IS_BETTER) JITMETADATAMETRIC(PerfScore, double, JIT_METADATA_LOWER_IS_BETTER) JITMETADATAMETRIC(BytesAllocated, int64_t, JIT_METADATA_LOWER_IS_BETTER) JITMETADATAMETRIC(ImporterBranchFold, int, 0)