From de1d3e2865d2bde5c701f30e5c3b37528a3b7000 Mon Sep 17 00:00:00 2001
From: ravil-mobile
Date: Mon, 25 Nov 2024 14:40:39 +0000
Subject: [PATCH] [AMD] Adjusted ordering of local stores and global loads for
 GEMMs

---
 third_party/amd/backend/compiler.py          |  2 +-
 .../include/TritonAMDGPUTransforms/Passes.h  |  4 +-
 .../include/TritonAMDGPUTransforms/Passes.td |  9 +++-
 .../ReorderInstructions.cpp                  | 44 ++++++++++++++-----
 third_party/amd/python/triton_amd.cc         |  5 ++-
 5 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py
index c222be2cd64d..2c8fe0309e22 100644
--- a/third_party/amd/backend/compiler.py
+++ b/third_party/amd/backend/compiler.py
@@ -248,7 +248,7 @@ def make_ttgir(mod, metadata, options):
         passes.ttgpuir.add_remove_layout_conversions(pm)
         passes.ttgpuir.add_reduce_data_duplication(pm)
         if amd.has_matrix_core_feature(options.arch):
-            amd.passes.ttgpuir.add_reorder_instructions(pm)
+            amd.passes.ttgpuir.add_reorder_instructions(pm, options.num_stages, stream_prefetch)
         if use_buffer_ops:
             amd.passes.ttgpuir.add_canonicalize_pointers(pm)
diff --git a/third_party/amd/include/TritonAMDGPUTransforms/Passes.h b/third_party/amd/include/TritonAMDGPUTransforms/Passes.h
index 636743d305f9..68b2c9c20244 100644
--- a/third_party/amd/include/TritonAMDGPUTransforms/Passes.h
+++ b/third_party/amd/include/TritonAMDGPUTransforms/Passes.h
@@ -17,7 +17,9 @@ createTritonAMDGPUAccelerateMatmulPass(std::string archGenName = std::string(),
 
 std::unique_ptr<Pass> createTritonAMDGPUCanonicalizeLoopsPass();
 
-std::unique_ptr<Pass> createTritonAMDGPUReorderInstructionsPass();
+std::unique_ptr<Pass>
+createTritonAMDGPUReorderInstructionsPass(int32_t numStages,
+                                          bool streamPrefetch);
 
 std::unique_ptr<Pass> createTritonAMDGPUVerifier();
 
diff --git a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td
index 85604dcaca18..33e78df4e24d 100644
--- a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td
+++ b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td
@@ -109,9 +109,16 @@ def TritonAMDGPUReorderInstructions: Pass<"tritonamdgpu-reorder-instructions", "mlir::ModuleOp"> {
     "conversions from shared memory before their first use) and (2) promote LLVM instruction "
     "order more friendly to `ptxas`.";
 
-  let constructor = "mlir::createTritonAMDGPUReorderInstructionsPass()";
+  let constructor = "mlir::createTritonAMDGPUReorderInstructionsPass(2, false)";
 
   let dependentDialects = [];
+
+  let options = [
+    Option<"numStages", "num_stages", "int32_t", /*default*/"2",
+           "number of pipeline stages">,
+    Option<"streamPrefetch", "local_prefetch", "bool", /*default*/"false",
+           "indicates whether stream prefetch is enabled">,
+  ];
 }
 
 def TritonAMDGPUConvertToBufferOps : Pass<"tritonamdgpu-convert-buffer-ops", "mlir::ModuleOp"> {
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
index bb427b7ef4e4..d3e5857797d0 100644
--- a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
+++ b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
@@ -209,14 +209,26 @@ static void moveUpTranspose(triton::FuncOp funcOp) {
 }
 
 // Schedule global load and local store ops for better GEMM performance.
-static void scheduleGlobalLoadLocalStore(scf::ForOp forOp) {
+static void
+scheduleGlobalLoadLocalStore(scf::ForOp forOp,
+                             const bool independentGlobalLoadStages) {
   SmallVector<Operation *> moveOps;
-  // Move global loads early to prefetch. This may increase register pressure
-  // but it enables issuing global loads early.
-  forOp.walk([&](triton::LoadOp op) { moveOps.push_back(op); });
-  // Move local_stores early if dependence distance greater than one iteration.
-  // Best perf on GEMM when these precede global loads.
-  forOp.walk([&](ttg::LocalStoreOp op) { moveOps.push_back(op); });
+
+  if (independentGlobalLoadStages) {
+    // Move local stores early to prefetch. This moves the corresponding
+    // memory fence to the very top of the current basic block, which
+    // results in better interleaving of `ds_write` and `global/buffer_load`
+    // instructions.
+    forOp.walk([&](ttg::LocalStoreOp op) { moveOps.push_back(op); });
+    forOp.walk([&](triton::LoadOp op) { moveOps.push_back(op); });
+  } else {
+    // Move global loads early to prefetch. This may increase register pressure
+    // but it enables issuing global loads early.
+    forOp.walk([&](triton::LoadOp op) { moveOps.push_back(op); });
+    // Move local_stores early if the dependence distance is greater than
+    // one iteration. Best perf on GEMM when these precede global loads.
+    forOp.walk([&](ttg::LocalStoreOp op) { moveOps.push_back(op); });
+  }
 
   for (auto op : llvm::reverse(moveOps)) {
     // Gather use-def chain in block.
@@ -360,6 +372,13 @@ namespace {
 struct TritonAMDGPUReorderInstructionsPass
     : public TritonAMDGPUReorderInstructionsBase<
           TritonAMDGPUReorderInstructionsPass> {
+
+  explicit TritonAMDGPUReorderInstructionsPass(int32_t numStages,
+                                               bool streamPrefetch) {
+    this->numStages = numStages;
+    this->streamPrefetch = streamPrefetch;
+  }
+
   void runOnOperation() override {
     ModuleOp m = getOperation();
     for (auto funcOp : m.getOps<triton::FuncOp>()) {
@@ -370,10 +389,12 @@ struct TritonAMDGPUReorderInstructionsPass
 
     moveUpTranspose(funcOp);
 
+    const bool independentGlobalLoadStages =
+        this->numStages > 2 || this->streamPrefetch;
     SmallVector<scf::ForOp> leafForOps = triton::AMD::getLeafForOps(funcOp);
     for (auto forOp : leafForOps) {
       if (isPureMatmulProblem(forOp)) {
-        scheduleGlobalLoadLocalStore(forOp);
+        scheduleGlobalLoadLocalStore(forOp, independentGlobalLoadStages);
         sinkSecondLoad(forOp);
       }
     }
@@ -382,6 +403,9 @@ struct TritonAMDGPUReorderInstructionsPass
 };
 } // namespace
 
-std::unique_ptr<Pass> mlir::createTritonAMDGPUReorderInstructionsPass() {
-  return std::make_unique<TritonAMDGPUReorderInstructionsPass>();
+std::unique_ptr<Pass>
+mlir::createTritonAMDGPUReorderInstructionsPass(int32_t numStages,
+                                                bool streamPrefetch) {
+  return std::make_unique<TritonAMDGPUReorderInstructionsPass>(numStages,
+                                                               streamPrefetch);
 }
diff --git a/third_party/amd/python/triton_amd.cc b/third_party/amd/python/triton_amd.cc
index 3c335099104d..91de061b2b62 100644
--- a/third_party/amd/python/triton_amd.cc
+++ b/third_party/amd/python/triton_amd.cc
@@ -70,8 +70,9 @@ void init_triton_amd_passes_ttgpuir(py::module &&m) {
                      mlir::createTritonAMDGPUCanonicalizePointersPass);
   ADD_PASS_WRAPPER_0("add_convert_to_buffer_ops",
                      mlir::createTritonAMDGPUConvertToBufferOpsPass);
-  ADD_PASS_WRAPPER_0("add_reorder_instructions",
-                     mlir::createTritonAMDGPUReorderInstructionsPass);
+  ADD_PASS_WRAPPER_2("add_reorder_instructions",
+                     mlir::createTritonAMDGPUReorderInstructionsPass, int32_t,
+                     bool);
   ADD_PASS_WRAPPER_2("add_stream_pipelinev2",
                      mlir::createTritonAMDGPUStreamPipelineV2Pass, int, int);
 }
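
Note: below is a minimal sketch (not part of the patch) of how a downstream
C++ pipeline could drive the new pass factory. Only the
createTritonAMDGPUReorderInstructionsPass(numStages, streamPrefetch) signature
and the `numStages > 2 || streamPrefetch` gating come from this patch; the
surrounding pass-manager setup and the helper name are illustrative
assumptions.

    #include <cstdint>

    #include "TritonAMDGPUTransforms/Passes.h"
    #include "mlir/Pass/PassManager.h"

    // Hypothetical helper: adds the reorder pass with explicit options.
    void buildExamplePipeline(mlir::PassManager &pm, int32_t numStages,
                              bool streamPrefetch) {
      // With numStages > 2 or stream prefetch enabled, the pass takes the
      // independentGlobalLoadStages path above and schedules local stores
      // ahead of global loads; otherwise it keeps the original
      // global-loads-first order.
      pm.addPass(mlir::createTritonAMDGPUReorderInstructionsPass(
          numStages, streamPrefetch));
    }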