Skip to content

Commit

Permalink
JIT: Refactor 3-opt utilities to facilitate expansion (#109982)
Browse files Browse the repository at this point in the history
Part of #107749. Follow-up to #103450. To facilitate implementing a global variant of 3-opt alongside the greedy variant, this moves some shared components to helper methods. I want to do this as a separate PR to ensure this change is truly no-diff, and has minimal (if any) TP impact.
  • Loading branch information
amanasifkhalid authored Nov 20, 2024
1 parent 5c218ab commit 00ba6d5
Show file tree
Hide file tree
Showing 2 changed files with 171 additions and 89 deletions.
4 changes: 4 additions & 0 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -6290,9 +6290,13 @@ class Compiler
#endif // DEBUG

weight_t GetCost(BasicBlock* block, BasicBlock* next);
bool TrySwappingPartitions(unsigned s1Start, unsigned s2Start, unsigned s3Start, unsigned s3End, unsigned s4End);

void ConsiderEdge(FlowEdge* edge);
void AddNonFallthroughSuccs(unsigned blockPos);
void AddNonFallthroughPreds(unsigned blockPos);
bool RunGreedyThreeOptPass(unsigned startPos, unsigned endPos);

bool RunThreeOptPass(BasicBlock* startBlock, BasicBlock* endBlock);

public:
Expand Down
256 changes: 167 additions & 89 deletions src/coreclr/jit/fgopt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4961,6 +4961,102 @@ weight_t Compiler::ThreeOptLayout::GetCost(BasicBlock* block, BasicBlock* next)
return maxCost;
}

//-----------------------------------------------------------------------------
// Compiler::ThreeOptLayout::TrySwappingPartitions: Evaluates the cost of swapping the given partitions.
// If it is profitable, write the swapped partitions back to 'blockOrder'.
//
// Parameters:
// s1Start - The starting position of the first partition
// s2Start - The starting position of the second partition
// s3Start - The starting position of the third partition
// s3End - The ending position (inclusive) of the third partition
// s4End - The ending position (inclusive) of the fourth partition
//
// Returns:
// True if the swap was performed, false otherwise
//
// Notes:
// Here is the proposed partition:
// S1: s1Start ~ s2Start-1
// S2: s2Start ~ s3Start-1
// S3: s3Start ~ s3End
// S4: remaining blocks
//
// After the swap:
// S1: s1Start ~ s2Start-1
// S3: s3Start ~ s3End
// S2: s2Start ~ s3Start-1
// S4: remaining blocks
//
// If 's3End' and 's4End' are the same, the fourth partition doesn't exist.
//
bool Compiler::ThreeOptLayout::TrySwappingPartitions(
unsigned s1Start, unsigned s2Start, unsigned s3Start, unsigned s3End, unsigned s4End)
{
BasicBlock* const s2Block = blockOrder[s2Start];
BasicBlock* const s2BlockPrev = blockOrder[s2Start - 1];
BasicBlock* const s3Block = blockOrder[s3Start];
BasicBlock* const s3BlockPrev = blockOrder[s3Start - 1];
BasicBlock* const lastBlock = blockOrder[s3End];

// Evaluate the cost of swapping S2 and S3
weight_t currCost = GetCost(s2BlockPrev, s2Block) + GetCost(s3BlockPrev, s3Block);
weight_t newCost = GetCost(s2BlockPrev, s3Block) + GetCost(lastBlock, s2Block);

// Consider flow into S4, if the partition exists
if (s3End < s4End)
{
BasicBlock* const s4StartBlock = blockOrder[s3End + 1];
currCost += GetCost(lastBlock, s4StartBlock);
newCost += GetCost(s3BlockPrev, s4StartBlock);
}
else
{
assert(s3End == s4End);
currCost += lastBlock->bbWeight;
newCost += s3BlockPrev->bbWeight;
}

// Check if the swap is profitable
if ((newCost >= currCost) || Compiler::fgProfileWeightsEqual(newCost, currCost, 0.001))
{
return false;
}

// We've found a profitable cut point. Continue with the swap.
JITDUMP("Swapping partitions [" FMT_BB ", " FMT_BB "] and [" FMT_BB ", " FMT_BB
"] (current partition cost = %f, new partition cost = %f)\n",
s2Block->bbNum, s3BlockPrev->bbNum, s3Block->bbNum, lastBlock->bbNum, currCost, newCost);
INDEBUG(const weight_t currLayoutCost = GetLayoutCost(s1Start, s4End));

// Swap the partitions
const unsigned s1Size = s2Start - s1Start;
const unsigned s2Size = s3Start - s2Start;
const unsigned s3Size = (s3End + 1) - s3Start;
BasicBlock** const regionStart = blockOrder + s1Start;
BasicBlock** const tempStart = tempOrder + s1Start;
memcpy(tempStart, regionStart, sizeof(BasicBlock*) * s1Size);
memcpy(tempStart + s1Size, regionStart + s1Size + s2Size, sizeof(BasicBlock*) * s3Size);
memcpy(tempStart + s1Size + s3Size, regionStart + s1Size, sizeof(BasicBlock*) * s2Size);

// Copy remaining blocks in S4 over
const unsigned numBlocks = (s4End - s1Start) + 1;
const unsigned swappedSize = s1Size + s2Size + s3Size;
const unsigned remainingSize = numBlocks - swappedSize;
assert(numBlocks >= swappedSize);
memcpy(tempStart + swappedSize, regionStart + swappedSize, sizeof(BasicBlock*) * remainingSize);

std::swap(blockOrder, tempOrder);

#ifdef DEBUG
// Ensure the swap improved the overall layout. Tolerate some imprecision.
const weight_t newLayoutCost = GetLayoutCost(s1Start, s4End);
assert((newLayoutCost < currLayoutCost) || Compiler::fgProfileWeightsEqual(newLayoutCost, currLayoutCost, 0.001));
#endif // DEBUG

return true;
}

//-----------------------------------------------------------------------------
// Compiler::ThreeOptLayout::ConsiderEdge: Adds 'edge' to 'cutPoints' for later consideration
// if 'edge' looks promising, and it hasn't been considered already.
Expand Down Expand Up @@ -5185,40 +5281,37 @@ void Compiler::ThreeOptLayout::Run()
}

//-----------------------------------------------------------------------------
// Compiler::ThreeOptLayout::RunThreeOptPass: Runs 3-opt for the given block range.
// Compiler::ThreeOptLayout::RunGreedyThreeOptPass: Runs 3-opt for the given block range,
// using a greedy strategy for finding partitions to swap.
//
// Parameters:
// startBlock - The first block of the range to reorder
// endBlock - The last block (inclusive) of the range to reorder
//
bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBlock* endBlock)
// Returns:
// True if we reordered anything, false otherwise
//
// Notes:
// For methods with more than a trivial number of basic blocks,
// iteratively trying every cut point is prohibitively expensive.
// Instead, add the non-fallthrough successor edges of each block to a priority queue,
// and try to create fallthrough on each edge via partition swaps, starting with the hottest edges.
// For each swap, repopulate the priority queue with edges along the modified cut points.
//
bool Compiler::ThreeOptLayout::RunGreedyThreeOptPass(unsigned startPos, unsigned endPos)
{
assert(startBlock != nullptr);
assert(endBlock != nullptr);
assert(cutPoints.Empty());
assert(startPos < endPos);
bool modified = false;

bool modified = false;
const unsigned startPos = ordinals[startBlock->bbNum];
const unsigned endPos = ordinals[endBlock->bbNum];
const unsigned numBlocks = (endPos - startPos + 1);
assert((startPos != 0) || startBlock->IsFirst());
assert(startPos <= endPos);

if (numBlocks < 3)
{
JITDUMP("Not enough blocks to partition anything. Skipping reordering.\n");
return false;
}
JITDUMP("Using greedy strategy for finding cut points.\n");

// Initialize cutPoints with candidate branches in this section
for (unsigned position = startPos; position <= endPos; position++)
{
AddNonFallthroughSuccs(position);
}

INDEBUG(weight_t currLayoutCost = GetLayoutCost(startPos, endPos);)
JITDUMP("Initial layout cost: %f\n", currLayoutCost);

// For each candidate edge, determine if it's profitable to partition after the source block
// and before the destination block, and swap the partitions to create fallthrough.
// If it is, do the swap, and for the blocks before/after each cut point that lost fallthrough,
Expand Down Expand Up @@ -5261,8 +5354,7 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc
// If the new cost improves upon the current cost, then we can justify the swap.

const bool isForwardJump = (srcPos < dstPos);
weight_t currCost, newCost;
unsigned part1Size, part2Size, part3Size;
unsigned s2Start, s3Start, s3End;

if (isForwardJump)
{
Expand All @@ -5277,20 +5369,12 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc
// S3: dstPos ~ endPos
// S2: srcPos+1 ~ dstPos-1
// S4: remaining blocks
part1Size = srcPos - startPos + 1;
part2Size = dstPos - srcPos - 1;
part3Size = endPos - dstPos + 1;

// Don't include branches into S4 in the cost calculation,
// since we're only considering branches within this region.
currCost = GetCost(srcBlk, blockOrder[srcPos + 1]) + GetCost(blockOrder[dstPos - 1], dstBlk) +
blockOrder[endPos]->bbWeight;
newCost = GetCost(srcBlk, dstBlk) + GetCost(blockOrder[endPos], blockOrder[srcPos + 1]) +
blockOrder[dstPos - 1]->bbWeight;
s2Start = srcPos + 1;
s3Start = dstPos;
s3End = endPos;
}
else
{
assert(dstPos < srcPos);

// Here is the proposed partition:
// S1: startPos ~ dstPos-1
Expand All @@ -5303,93 +5387,87 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc
// S3: srcPos
// S2: dstPos ~ srcPos-1
// S4: srcPos+1 ~ endPos
part1Size = dstPos - startPos;
part2Size = srcPos - dstPos;
part3Size = 1;

currCost = GetCost(blockOrder[srcPos - 1], srcBlk) + GetCost(blockOrder[dstPos - 1], dstBlk);
newCost = GetCost(srcBlk, dstBlk) + GetCost(blockOrder[dstPos - 1], srcBlk);

if (srcPos != endPos)
{
currCost += GetCost(srcBlk, blockOrder[srcPos + 1]);
newCost += GetCost(blockOrder[srcPos - 1], blockOrder[srcPos + 1]);
}
else
{
currCost += srcBlk->bbWeight;
newCost += blockOrder[srcPos - 1]->bbWeight;
}
s2Start = dstPos;
s3Start = srcPos;
s3End = srcPos;
}

// Continue evaluating partitions if this one isn't profitable
if ((newCost >= currCost) || Compiler::fgProfileWeightsEqual(currCost, newCost, 0.001))
if (!TrySwappingPartitions(startPos, s2Start, s3Start, s3End, endPos))
{
continue;
}

// We've found a profitable cut point. Continue with the swap.
JITDUMP("Creating fallthrough for " FMT_BB " -> " FMT_BB
" (current partition cost = %f, new partition cost = %f)\n",
srcBlk->bbNum, dstBlk->bbNum, currCost, newCost);

// Swap the partitions
BasicBlock** const regionStart = blockOrder + startPos;
BasicBlock** const tempStart = tempOrder + startPos;
memcpy(tempStart, regionStart, sizeof(BasicBlock*) * part1Size);
memcpy(tempStart + part1Size, regionStart + part1Size + part2Size, sizeof(BasicBlock*) * part3Size);
memcpy(tempStart + part1Size + part3Size, regionStart + part1Size, sizeof(BasicBlock*) * part2Size);

// For backward jumps, there might be additional blocks after 'srcBlk' that need to be copied over.
const unsigned swappedSize = part1Size + part2Size + part3Size;
const unsigned remainingSize = numBlocks - swappedSize;
assert(numBlocks >= swappedSize);
assert((remainingSize == 0) || !isForwardJump);
memcpy(tempStart + swappedSize, regionStart + swappedSize, sizeof(BasicBlock*) * remainingSize);

std::swap(blockOrder, tempOrder);

// Update the ordinals for the blocks we moved
for (unsigned i = (startPos + part1Size); i <= endPos; i++)
for (unsigned i = s2Start; i <= endPos; i++)
{
ordinals[blockOrder[i]->bbNum] = i;
}

// Ensure this move created fallthrough from 'srcBlk' to 'dstBlk'
assert((ordinals[srcBlk->bbNum] + 1) == ordinals[dstBlk->bbNum]);
modified = true;

// At every cut point is an opportunity to consider more candidate edges.
// To the left of each cut point, consider successor edges that don't fall through.
// Ditto predecessor edges to the right of each cut point.
AddNonFallthroughSuccs(startPos + part1Size - 1);
AddNonFallthroughSuccs(startPos + part1Size + part2Size - 1);
AddNonFallthroughSuccs(startPos + part1Size + part2Size + part3Size - 1);
AddNonFallthroughSuccs(s2Start - 1);
AddNonFallthroughPreds(s2Start);
AddNonFallthroughSuccs(s3Start - 1);
AddNonFallthroughPreds(s3Start);
AddNonFallthroughSuccs(s3End);

AddNonFallthroughPreds(startPos + part1Size);
AddNonFallthroughPreds(startPos + part1Size + part2Size);

if (remainingSize != 0)
if (s3End < endPos)
{
AddNonFallthroughPreds(startPos + part1Size + part2Size + part3Size);
AddNonFallthroughPreds(s3End + 1);
}

#ifdef DEBUG
// Ensure the swap improved the overall layout. Tolerate some imprecision.
const weight_t newLayoutCost = GetLayoutCost(startPos, endPos);
assert((newLayoutCost < currLayoutCost) ||
Compiler::fgProfileWeightsEqual(newLayoutCost, currLayoutCost, 0.001));
currLayoutCost = newLayoutCost;
#endif // DEBUG
modified = true;
}

return modified;
}

//-----------------------------------------------------------------------------
// Compiler::ThreeOptLayout::RunThreeOptPass: Runs 3-opt for the given block range.
//
// Parameters:
// startBlock - The first block of the range to reorder
// endBlock - The last block (inclusive) of the range to reorder
//
// Returns:
// True if we reordered anything, false otherwise
//
bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBlock* endBlock)
{
assert(startBlock != nullptr);
assert(endBlock != nullptr);

const unsigned startPos = ordinals[startBlock->bbNum];
const unsigned endPos = ordinals[endBlock->bbNum];
const unsigned numBlocks = (endPos - startPos + 1);
assert((startPos != 0) || startBlock->IsFirst());
assert(startPos <= endPos);

if (numBlocks < 3)
{
JITDUMP("Not enough blocks to partition anything. Skipping reordering.\n");
return false;
}

JITDUMP("Initial layout cost: %f\n", GetLayoutCost(startPos, endPos));
const bool modified = RunGreedyThreeOptPass(startPos, endPos);

// Write back to 'tempOrder' so changes to this region aren't lost next time we swap 'tempOrder' and 'blockOrder'
if (modified)
{
memcpy(tempOrder + startPos, blockOrder + startPos, sizeof(BasicBlock*) * numBlocks);
JITDUMP("Final layout cost: %f\n", GetLayoutCost(startPos, endPos));
}
else
{
JITDUMP("No changes made.\n");
}

JITDUMP("Final layout cost: %f\n", currLayoutCost);
return modified;
}

Expand Down

0 comments on commit 00ba6d5

Please sign in to comment.