Merge pull request #943 from xmos/async_flash
Async flash loads
panickal-xmos authored Dec 1, 2024
2 parents f60ef11 + 5cf1f6b commit 0a63d97
Showing 10 changed files with 120 additions and 12 deletions.
11 changes: 11 additions & 0 deletions xformer/Analysis/MemoryPlan.cpp
@@ -161,6 +161,17 @@ std::vector<int> MemoryPlan::getAllocatedOffsets(const bool overlapOps,
llvm::DenseSet<Operation *> alreadyVisited;
if (overlapOps) {
for (auto o : operations) {

// For async loads, use the same buffer for load and wait
if (llvm::isa<LoadWeightsWaitOp>(o)) {
for (int i = 0; i < o->getNumOperands(); i++) {
auto inVal = o->getOperand(i);
auto outVal = o->getResult(i);
vInfo[outVal].firstUsed = vInfo[inVal].firstUsed;
inOutMap[inVal] = {outVal, 0};
}
}

// We iterate through overlappable ops which have not been visited yet
if (o->hasTrait<OpTrait::xcore::MemoryOverlappable>() &&
!alreadyVisited.contains(o)) {
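
For intuition, here is a minimal standalone sketch of the aliasing bookkeeping above (plain std::map stand-ins for the MLIR DenseMap/Value types; the ids and numbers are hypothetical): the wait op's result inherits the load result's first-use point and is mapped onto the same buffer, so the planner allocates a single region for the load/wait pair.

#include <cstdio>
#include <map>
#include <utility>

struct ValueInfo {
  int firstUsed;
};

int main() {
  std::map<int, ValueInfo> vInfo;              // value id -> liveness info
  std::map<int, std::pair<int, int>> inOutMap; // input id -> (output id, offset)

  const int loadResult = 0, waitResult = 1;
  vInfo[loadResult] = {/*firstUsed=*/5}; // async load issued at op 5

  // Mirror of the LoadWeightsWaitOp handling: the wait result inherits the
  // load result's first use and is aliased onto the load result's buffer.
  vInfo[waitResult].firstUsed = vInfo[loadResult].firstUsed;
  inOutMap[loadResult] = {waitResult, 0};

  std::printf("wait result first used at op %d\n", vInfo[waitResult].firstUsed);
  return 0;
}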
4 changes: 4 additions & 0 deletions xformer/Analysis/MemoryPlan.h
@@ -47,6 +47,10 @@ class MemoryPlan {

int getNextBottomOpId(int opId);

DenseMap<Operation *, size_t> getOperationsIDMap() { return operationIds; }

std::vector<Operation *> getOperationsSequence() { return operations; }

// OpSplitPlan getOpSplitPlan();

void printMemoryPlan();
22 changes: 20 additions & 2 deletions xformer/IR/XCoreOps.td
@@ -509,15 +509,33 @@ def XC_LoadConstantOp
let results = (outs AnyTensor : $output);
}

def XC_LoadWeights_Sync : I32EnumAttrCase<"Sync", 0>;
def XC_LoadWeights_DDR : I32EnumAttrCase<"DDR", 1>;
def XC_LoadWeights_Async : I32EnumAttrCase<"Async", 2>;
def XC_LoadWeights_OpTypeAttr
: I32EnumAttr<"LoadWeightsOpType", "op type enum", [
XC_LoadWeights_Sync, XC_LoadWeights_DDR, XC_LoadWeights_Async
]>;

def XC_LoadWeightsOp : XC_Op<"ld_weights", [Pure]> {
let summary = "Load weights op";

let description = [{Load weights op.}];

let arguments = (ins I32Attr
: $address, I32ArrayAttr
: $sizes, BoolAttr
: $in_ddr);
: $sizes, StrAttr
: $op_type);

let results = (outs Variadic<AnyTensor> : $output);
}

def XC_LoadWeightsWaitOp : XC_Op<"ld_weights_wait", [Pure]> {
let summary = "Load weights wait op";

let description = [{Load weights wait op.}];

let arguments = (ins Variadic<AnyTensor> : $input);

let results = (outs Variadic<AnyTensor> : $output);
}
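
For readers unfamiliar with I32EnumAttr: MLIR tablegen generates stringify/symbolize helpers for this enum, which the TranslateToCustomOp.cpp and WriteWeights.cpp hunks below call. A hand-written stand-in for illustration only (the real versions are generated code; only the case names and values come from the .td above):

#include <cassert>
#include <optional>
#include <string_view>

enum class LoadWeightsOpType : int { Sync = 0, DDR = 1, Async = 2 };

std::string_view stringifyLoadWeightsOpType(LoadWeightsOpType v) {
  switch (v) {
  case LoadWeightsOpType::Sync:
    return "Sync";
  case LoadWeightsOpType::DDR:
    return "DDR";
  case LoadWeightsOpType::Async:
    return "Async";
  }
  return "";
}

std::optional<LoadWeightsOpType> symbolizeLoadWeightsOpType(std::string_view s) {
  if (s == "Sync")
    return LoadWeightsOpType::Sync;
  if (s == "DDR")
    return LoadWeightsOpType::DDR;
  if (s == "Async")
    return LoadWeightsOpType::Async;
  return std::nullopt;
}

int main() {
  assert(stringifyLoadWeightsOpType(LoadWeightsOpType::Async) == "Async");
  assert(symbolizeLoadWeightsOpType("DDR") == LoadWeightsOpType::DDR);
  return 0;
}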
2 changes: 1 addition & 1 deletion xformer/Transforms/OptimizeConv2D.cpp
@@ -86,7 +86,7 @@ struct ChannelwiseSplitConv2DOutputPattern
return splitResultType;
};

if(!llvm::isa<TFL::QConstOp>(op.getFilter().getDefiningOp()))
if (!llvm::isa<TFL::QConstOp>(op.getFilter().getDefiningOp()))
return failure();

auto filterQConstOp =
1 change: 1 addition & 0 deletions xformer/Transforms/Options.h
@@ -15,6 +15,7 @@ extern llvm::cl::opt<unsigned> threadCountOption;
extern llvm::cl::opt<std::string> weightsFilenameOption;
extern llvm::cl::opt<unsigned> loadExternallyIfLargerOption;
extern llvm::cl::opt<bool> weightsAsArrayOption;
extern llvm::cl::opt<bool> asyncLoadWeightsOption;
extern llvm::cl::opt<bool> weightsInExternalMemory;
extern llvm::cl::opt<unsigned> maxLoadExternalSizeOption;
extern llvm::cl::opt<double> convQuantErrorThresholdOption;
8 changes: 5 additions & 3 deletions xformer/Transforms/TranslateToCustomOp.cpp
@@ -15,6 +15,7 @@ namespace mlir::xcore {
std::vector<uint8_t> Expand8To16Op::buildCustomOptions() { return {}; }
std::vector<uint8_t> FakeScratchBufferOp::buildCustomOptions() { return {}; }
std::vector<uint8_t> Bsign8Op::buildCustomOptions() { return {}; }
std::vector<uint8_t> LoadWeightsWaitOp::buildCustomOptions() { return {}; }

std::vector<uint8_t> UnaryI16Op::buildCustomOptions() {
flexbuffers::Builder fbb;
@@ -153,13 +154,13 @@ std::vector<uint8_t> ConcatOp::buildCustomOptions() {
std::vector<uint8_t> LoadWeightsOp::buildCustomOptions() {
flexbuffers::Builder fbb;
auto rootMap = fbb.StartMap();
fbb.Int("addr", (int32_t)getAddress());
auto sizesVec = fbb.StartVector("sizes");
fbb.Int("a", (int32_t)getAddress());
auto sizesVec = fbb.StartVector("s");
for (int i = 0; i < getSizes().cast<ArrayAttr>().size(); ++i) {
fbb.Int(getSizes().cast<ArrayAttr>()[i].cast<IntegerAttr>().getInt());
}
fbb.EndVector(sizesVec, false, false);
fbb.Bool("ddr", (bool)getInDdr());
fbb.Int("t", (int32_t)(symbolizeLoadWeightsOpType(getOpType()).value()));
fbb.EndMap(rootMap);
fbb.Finish();
return fbb.GetBuffer();
@@ -296,6 +297,7 @@ void TranslateToCustomOp::runOnOperation() {
patterns.insert<RewriteToCustomOp<FakeScratchBufferOp>>(ctx);
patterns.insert<RewriteToCustomOp<FakeSliceOp>>(ctx);
patterns.insert<RewriteToCustomOp<Expand8To16Op>>(ctx);
patterns.insert<RewriteToCustomOp<LoadWeightsWaitOp>>(ctx);

(void)applyPatternsAndFoldGreedily(func, std::move(patterns));
}
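
On the consuming side, a kernel can decode the options written above with the flexbuffers reader API. A hedged sketch (the struct and function names here are made up; only the "a"/"s"/"t" keys and the enum values are taken from this diff):

#include <cstddef>
#include <cstdint>
#include <vector>

#include "flatbuffers/flexbuffers.h"

// Decoded form of LoadWeightsOp::buildCustomOptions() output.
struct LoadWeightsParams {
  int32_t address;            // "a"
  std::vector<int32_t> sizes; // "s"
  int32_t opType;             // "t": 0 = Sync, 1 = DDR, 2 = Async
};

LoadWeightsParams parseLoadWeightsOptions(const uint8_t *data, size_t len) {
  LoadWeightsParams p;
  auto map = flexbuffers::GetRoot(data, len).AsMap();
  p.address = map["a"].AsInt32();
  auto sizes = map["s"].AsVector();
  for (size_t i = 0; i < sizes.size(); ++i)
    p.sizes.push_back(sizes[i].AsInt32());
  p.opType = map["t"].AsInt32();
  return p;
}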
64 changes: 62 additions & 2 deletions xformer/Transforms/WriteWeights.cpp
@@ -1,6 +1,7 @@
// Copyright 2021 XMOS LIMITED. This Software is subject to the terms of the
// XMOS Public License: Version 1

#include "Analysis/MemoryPlan.h"
#include "IR/XCoreOps.h"
#include "Transforms/Options.h"
#include "Utils/FileIO.h"
@@ -70,6 +71,7 @@ struct WriteWeightsPattern : public OpRewritePattern<LoadConstantOp> {
// load is not from external memory.
// External memory loads have to be aligned to 32 bytes/256 bits for max
// speed
LoadWeightsOpType opType = LoadWeightsOpType::Sync;
if (loadOp.getResult().hasOneUse() && !weightsInExternalMemory) {
auto use = loadOp->use_begin();
Operation *ownerOp = use->getOwner();
@@ -93,7 +95,7 @@ struct WriteWeightsPattern : public OpRewritePattern<LoadConstantOp> {

auto loadWeightsOp = rewriter.create<LoadWeightsOp>(
loadOp.getLoc(), outputTypes, address,
rewriter.getArrayAttr(dataSizes), /*in_ddr=*/false);
rewriter.getArrayAttr(dataSizes), stringifyLoadWeightsOpType(opType));

for (int i = 0; i < opNums.size(); i++) {
ownerOp->setOperand(opNums[i], loadWeightsOp.getResult(i));
@@ -111,10 +113,11 @@ struct WriteWeightsPattern : public OpRewritePattern<LoadConstantOp> {
auto toBePaddedSize = alignedSize - loadOpData.size();
// Pad with zeros
tensorData.insert(tensorData.end(), toBePaddedSize, 0);
opType = LoadWeightsOpType::DDR;
}
auto loadWeightsOp = rewriter.create<LoadWeightsOp>(
loadOp.getLoc(), loadOp.getType(), address,
rewriter.getArrayAttr(dataSizes), /*in_ddr=*/weightsInExternalMemory);
rewriter.getArrayAttr(dataSizes), stringifyLoadWeightsOpType(opType));
rewriter.replaceOp(loadOp, loadWeightsOp.getOutput());

// Find all uses of loadWeightsOp and find the first Owner op
@@ -139,6 +142,34 @@ struct WriteWeightsPattern : public OpRewritePattern<LoadConstantOp> {
std::vector<std::vector<char>> *tensorsVec_;
};

struct LowerToAsyncLoadsPattern : public OpRewritePattern<LoadWeightsOp> {
LowerToAsyncLoadsPattern(MLIRContext *context)
: OpRewritePattern<LoadWeightsOp>(context) {}

LogicalResult matchAndRewrite(LoadWeightsOp loadWeightsOp,
PatternRewriter &rewriter) const override {
if (loadWeightsOp.getOpType() !=
stringifyLoadWeightsOpType(LoadWeightsOpType::Sync)) {
return failure();
}

// We use loadWeightsOp.getResultTypes() as the load weights op can have a
// variadic number of results
auto loadWeightsAsyncOp = rewriter.create<LoadWeightsOp>(
loadWeightsOp.getLoc(), loadWeightsOp.getResultTypes(),
loadWeightsOp.getAddress(), loadWeightsOp.getSizes(),
stringifyLoadWeightsOpType(LoadWeightsOpType::Async));

auto loadWeightsWaitOp = rewriter.create<LoadWeightsWaitOp>(
loadWeightsAsyncOp.getLoc(), loadWeightsAsyncOp.getResultTypes(),
loadWeightsAsyncOp.getResults());

rewriter.replaceOp(loadWeightsOp, loadWeightsWaitOp.getOutput());

return success();
}
};
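
Note the match condition above: only loads tagged Sync are split into an Async load plus a wait, so DDR loads (external memory) stay synchronous, consistent with the flag description in XCoreOptMain.cpp. And because the replacement load is tagged Async, it no longer matches the pattern, so the greedy rewrite driver terminates.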

void WriteWeights::runOnOperation() {
func::FuncOp f = getOperation();
if (weightsFilenameOption.empty()) {
@@ -154,8 +185,37 @@
std::vector<std::vector<char>> tensorsVec;
RewritePatternSet patterns(ctx);
patterns.insert<WriteWeightsPattern>(&tensorsVec, ctx);
if (asyncLoadWeightsOption) {
patterns.insert<LowerToAsyncLoadsPattern>(ctx);
}
(void)applyPatternsAndFoldGreedily(func, std::move(patterns));

// Reorder async loads to be before the previous convolution
// so that compute can be overlapped with the load
auto &m = getAnalysis<MemoryPlan>();
auto opIdMap = m.getOperationsIDMap();
auto ops = m.getOperationsSequence();

llvm::SetVector<int> convOpIds;
for (auto o : ops) {
if (llvm::isa<Conv2DV2Op>(o)) {
convOpIds.insert(opIdMap[o]);
}
}

for (auto o : ops) {
if (llvm::isa<LoadWeightsOp>(o)) {
auto ldOp = dyn_cast<LoadWeightsOp>(o);
if (ldOp.getOpType() ==
stringifyLoadWeightsOpType(LoadWeightsOpType::Async)) {
int idx = llvm::lower_bound(convOpIds, opIdMap[o]) - convOpIds.begin();
if (idx > 0) {
o->moveBefore(ops[convOpIds[idx - 1]]);
}
}
}
}
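
As a worked example of the placement logic: with convolutions at op ids {3, 7, 12} and an async load at id 9, llvm::lower_bound returns idx = 2, so the load is moved before the convolution at id 7 and the flash read overlaps that convolution's compute; a load already ahead of the first convolution (idx == 0) is left in place.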

if (failed(utils::writeWeightsToFile(weightsFilenameOption, tensorsVec,
weightsAsArrayOption,
weightsInExternalMemory))) {
16 changes: 14 additions & 2 deletions xformer/XCoreOptMain.cpp
@@ -85,6 +85,12 @@ cl::opt<bool> weightsInExternalMemory(
"it in external memory."),
cl::init(false), cl::cat(XformerCategory));

cl::opt<bool> asyncLoadWeightsOption(
"xcore-async-load-weights",
cl::desc("Enable loading weights from flash asynchronously. This does not "
"affect loads from external memory."),
cl::init(false), cl::cat(XformerCategory));
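
Async flash loads are therefore opt-in; presumably something along the lines of the following enables them (the binary name and other flags are assumptions, the flag spelling is from the definition above):

xcore-opt --xcore-async-load-weights <other options> model.tflite

The check added further down rejects combining this flag with xcore-weights-in-external-memory.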

cl::opt<unsigned> loadExternallyIfLargerOption(
"xcore-load-externally-if-larger",
cl::desc("Load constants externally if larger than given limit in bytes "
@@ -470,8 +476,14 @@ int main(int argc, char **argv) {
if (mlir::xcore::weightsInExternalMemory.getNumOccurrences() > 0 &&
mlir::xcore::weightsAsArrayOption.getNumOccurrences() == 0) {
return failedMessage(
"Please specify the xcore-write-weights-as-array"
"when using the xcore-weights-in-external-memory option!");
"Please specify xcore-write-weights-as-array"
" when using the xcore-weights-in-external-memory option!");
}

if (mlir::xcore::weightsInExternalMemory.getNumOccurrences() > 0 &&
mlir::xcore::asyncLoadWeightsOption.getNumOccurrences() > 0) {
return failedMessage("Please don't specify xcore-weights-in-external-memory"
" when using the xcore-async-load-weights option!");
}

if (mlir::xcore::loadExternallyIfLargerOption.getNumOccurrences() > 0 &&
2 changes: 1 addition & 1 deletion xformer/lib_tflite_micro.BUILD
@@ -30,7 +30,7 @@ filegroup(
"lib_tflite_micro/src/tflite-xcore-kernels/xcore_conv2d_v2.cc",
"lib_tflite_micro/src/tflite-xcore-kernels/xcore_maxpool2d.cc",
"lib_tflite_micro/src/tflite-xcore-kernels/xcore_detection_post.cc",
"lib_tflite_micro/src/tflite-xcore-kernels/xcore_load_weights.cc",
"lib_tflite_micro/src/tflite-xcore-kernels/xcore_load_weights_wait.cc",
"lib_tflite_micro/src/tflite-xcore-kernels/xcore_lookup.cc",
"lib_tflite_micro/src/tflite-xcore-kernels/xcore_softmax.cc",
"lib_tflite_micro/src/tflite-xcore-kernels/xcore_batched_softmax.cc",
