From de1d3e2865d2bde5c701f30e5c3b37528a3b7000 Mon Sep 17 00:00:00 2001
From: ravil-mobile
Date: Mon, 25 Nov 2024 14:40:39 +0000
Subject: [PATCH] [AMD] Adjusted ordering of local stores and global loads for
 GEMMs

---
 third_party/amd/backend/compiler.py          |  2 +-
 .../include/TritonAMDGPUTransforms/Passes.h  |  4 +-
 .../include/TritonAMDGPUTransforms/Passes.td |  9 +++-
 .../ReorderInstructions.cpp                  | 44 ++++++++++++++-----
 third_party/amd/python/triton_amd.cc         |  5 ++-
 5 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py
index c222be2cd64d..2c8fe0309e22 100644
--- a/third_party/amd/backend/compiler.py
+++ b/third_party/amd/backend/compiler.py
@@ -248,7 +248,7 @@ def make_ttgir(mod, metadata, options):
         passes.ttgpuir.add_remove_layout_conversions(pm)
         passes.ttgpuir.add_reduce_data_duplication(pm)
         if amd.has_matrix_core_feature(options.arch):
-            amd.passes.ttgpuir.add_reorder_instructions(pm)
+            amd.passes.ttgpuir.add_reorder_instructions(pm, options.num_stages, stream_prefetch)
         if use_buffer_ops:
             amd.passes.ttgpuir.add_canonicalize_pointers(pm)
diff --git a/third_party/amd/include/TritonAMDGPUTransforms/Passes.h b/third_party/amd/include/TritonAMDGPUTransforms/Passes.h
index 636743d305f9..68b2c9c20244 100644
--- a/third_party/amd/include/TritonAMDGPUTransforms/Passes.h
+++ b/third_party/amd/include/TritonAMDGPUTransforms/Passes.h
@@ -17,7 +17,9 @@ createTritonAMDGPUAccelerateMatmulPass(std::string archGenName = std::string(),
 
 std::unique_ptr<Pass> createTritonAMDGPUCanonicalizeLoopsPass();
 
-std::unique_ptr<Pass> createTritonAMDGPUReorderInstructionsPass();
+std::unique_ptr<Pass>
+createTritonAMDGPUReorderInstructionsPass(int32_t numStages,
+                                          bool streamPrefetch);
 
 std::unique_ptr<Pass> createTritonAMDGPUVerifier();
 
diff --git a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td
index 85604dcaca18..33e78df4e24d 100644
--- a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td
+++ b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td
@@ -109,9 +109,16 @@ def TritonAMDGPUReorderInstructions: Pass<"tritonamdgpu-reorder-instructions", "mlir::ModuleOp"> {
     "conversions from shared memory before their first use) and (2) promote LLVM instruction "
     "order more friendly to `ptxas`.";
 
-  let constructor = "mlir::createTritonAMDGPUReorderInstructionsPass()";
+  let constructor = "mlir::createTritonAMDGPUReorderInstructionsPass(2, false)";
 
   let dependentDialects = [];
+
+  let options = [
+    Option<"numStages", "num_stages", "int32_t", /*default*/"2",
+           "number of pipeline stages">,
+    Option<"streamPrefetch", "local_prefetch", "bool", /*default*/"false",
+           "indicates whether stream prefetch is enabled">,
+  ];
 }
 
 def TritonAMDGPUConvertToBufferOps : Pass<"tritonamdgpu-convert-buffer-ops", "mlir::ModuleOp"> {
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
index bb427b7ef4e4..d3e5857797d0 100644
--- a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
+++ b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
@@ -209,14 +209,26 @@ static void moveUpTranspose(triton::FuncOp funcOp) {
 }
 
 // Schedule global load and local store ops for better GEMM performance.
-static void scheduleGlobalLoadLocalStore(scf::ForOp forOp) {
+static void
+scheduleGlobalLoadLocalStore(scf::ForOp forOp,
+                             const bool independentGlobalLoadStages) {
   SmallVector<Operation *> moveOps;
-  // Move global loads early to prefetch. This may increase register pressure
-  // but it enables issuing global loads early.
-  forOp.walk([&](triton::LoadOp op) { moveOps.push_back(op); });
-  // Move local_stores early if dependence distance greater than one iteration.
-  // Best perf on GEMM when these precede global loads.
-  forOp.walk([&](ttg::LocalStoreOp op) { moveOps.push_back(op); });
+
+  if (independentGlobalLoadStages) {
+    // Move local stores early to prefetch. This moves the corresponding
+    // memory fence to the very top of the current basic block, which
+    // results in better interleaving of `ds_write` and `global/buffer_load`
+    // instructions.
+    forOp.walk([&](ttg::LocalStoreOp op) { moveOps.push_back(op); });
+    forOp.walk([&](triton::LoadOp op) { moveOps.push_back(op); });
+  } else {
+    // Move global loads early to prefetch. This may increase register pressure
+    // but it enables issuing global loads early.
+    forOp.walk([&](triton::LoadOp op) { moveOps.push_back(op); });
+    // Move local_stores early if the dependence distance is greater than
+    // one iteration. Best perf on GEMM when these precede global loads.
+    forOp.walk([&](ttg::LocalStoreOp op) { moveOps.push_back(op); });
+  }
 
   for (auto op : llvm::reverse(moveOps)) {
     // Gather use-def chain in block.
@@ -360,6 +372,13 @@ namespace {
 struct TritonAMDGPUReorderInstructionsPass
     : public TritonAMDGPUReorderInstructionsBase<
           TritonAMDGPUReorderInstructionsPass> {
+
+  explicit TritonAMDGPUReorderInstructionsPass(int32_t numStages,
+                                               bool streamPrefetch) {
+    this->numStages = numStages;
+    this->streamPrefetch = streamPrefetch;
+  }
+
   void runOnOperation() override {
     ModuleOp m = getOperation();
     for (auto funcOp : m.getOps<triton::FuncOp>()) {
@@ -370,10 +389,12 @@ struct TritonAMDGPUReorderInstructionsPass
 
     moveUpTranspose(funcOp);
 
+    const bool independentGlobalLoadStages =
+        this->numStages > 2 || this->streamPrefetch;
     SmallVector<scf::ForOp> leafForOps = triton::AMD::getLeafForOps(funcOp);
     for (auto forOp : leafForOps) {
       if (isPureMatmulProblem(forOp)) {
-        scheduleGlobalLoadLocalStore(forOp);
+        scheduleGlobalLoadLocalStore(forOp, independentGlobalLoadStages);
         sinkSecondLoad(forOp);
       }
     }
@@ -382,6 +403,9 @@ struct TritonAMDGPUReorderInstructionsPass
 };
 } // namespace
 
-std::unique_ptr<Pass> mlir::createTritonAMDGPUReorderInstructionsPass() {
-  return std::make_unique<TritonAMDGPUReorderInstructionsPass>();
+std::unique_ptr<Pass>
+mlir::createTritonAMDGPUReorderInstructionsPass(int32_t numStages,
+                                                bool streamPrefetch) {
+  return std::make_unique<TritonAMDGPUReorderInstructionsPass>(numStages,
+                                                               streamPrefetch);
 }
diff --git a/third_party/amd/python/triton_amd.cc b/third_party/amd/python/triton_amd.cc
index 3c335099104d..91de061b2b62 100644
--- a/third_party/amd/python/triton_amd.cc
+++ b/third_party/amd/python/triton_amd.cc
@@ -70,8 +70,9 @@ void init_triton_amd_passes_ttgpuir(py::module &&m) {
                      mlir::createTritonAMDGPUCanonicalizePointersPass);
   ADD_PASS_WRAPPER_0("add_convert_to_buffer_ops",
                      mlir::createTritonAMDGPUConvertToBufferOpsPass);
-  ADD_PASS_WRAPPER_0("add_reorder_instructions",
-                     mlir::createTritonAMDGPUReorderInstructionsPass);
+  ADD_PASS_WRAPPER_2("add_reorder_instructions",
+                     mlir::createTritonAMDGPUReorderInstructionsPass, int32_t,
+                     bool);
   ADD_PASS_WRAPPER_2("add_stream_pipelinev2",
                      mlir::createTritonAMDGPUStreamPipelineV2Pass, int, int);
 }
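
Note: below is a minimal sketch (not part of the patch) of how a downstream
C++ pipeline could drive the new pass factory. Only the
createTritonAMDGPUReorderInstructionsPass(numStages, streamPrefetch) signature
and the `numStages > 2 || streamPrefetch` gating come from this patch; the
surrounding pass-manager setup and the helper name are illustrative
assumptions.

    #include <cstdint>

    #include "TritonAMDGPUTransforms/Passes.h"
    #include "mlir/Pass/PassManager.h"

    // Hypothetical helper: adds the reorder pass with explicit options.
    void buildExamplePipeline(mlir::PassManager &pm, int32_t numStages,
                              bool streamPrefetch) {
      // With numStages > 2 or stream prefetch enabled, the pass takes the
      // independentGlobalLoadStages path above and schedules local stores
      // ahead of global loads; otherwise it keeps the original
      // global-loads-first order.
      pm.addPass(mlir::createTritonAMDGPUReorderInstructionsPass(
          numStages, streamPrefetch));
    }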