Commit 373261d

Merge pull request #945 from xmos/allocate_same_tensors

Add changes for allocating same offset for input and output tensors

panickal-xmos authored Dec 16, 2024
2 parents bc0b994 + de13620
Showing 8 changed files with 358 additions and 15 deletions.
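In short: the user supplies an alternating list of input and output tensor names, and the memory planner pins each named pair to a single offset so the paired input and output tensors share the same memory. A minimal standalone sketch of that idea follows (simplified sequential packing with hypothetical names; the actual xformer implementation reuses its greedy getOffset placement, as shown in the MemoryPlan.cpp diff below):

#include <algorithm>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Assign one shared offset per (input, output) pair; in the real planner the
// remaining buffers are packed around these pinned buffers by the greedy plan.
std::map<std::string, int> assignSharedOffsets(
    const std::vector<std::pair<std::string, std::string>> &pairs,
    const std::map<std::string, int> &sizes) {
  std::map<std::string, int> offsets;
  int next = 0;
  for (const auto &p : pairs) {
    offsets[p.first] = next;  // input tensor
    offsets[p.second] = next; // output tensor shares the same allocation
    next += std::max(sizes.at(p.first), sizes.at(p.second));
  }
  return offsets;
}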
2 changes: 1 addition & 1 deletion third_party/lib_tflite_micro
Submodule lib_tflite_micro updated 0 files
153 changes: 147 additions & 6 deletions xformer/Analysis/MemoryPlan.cpp
@@ -129,7 +129,9 @@ int MemoryPlan::getOffset(Value v, int size,

if ((valueInfo[allocatedVal].firstUsed > valueInfo[v].lastUsed) ||
(valueInfo[v].firstUsed > valueInfo[allocatedVal].lastUsed)) {
// No overlap
// There is no temporal overlap with this buffer, so it cannot clash in
// memory; keep scanning until we find one that does. At the first clash we
// can place the new buffer before it if the gap is large enough, since none
// of the buffers skipped so far overlap with it.
continue;
}

@@ -149,6 +151,70 @@ int MemoryPlan::getOffset(Value v, int size,
return offset;
}

void MemoryPlan::buildInputOutputTensorMaps(
llvm::StringMap<Value> &inputTensorMap,
llvm::StringMap<Value> &outputTensorMap) {
auto buildMap = [&](StringRef argAttr, StringRef nameAttr,
llvm::SmallVector<std::string> &attrsInOrder) {
llvm::StringMap<std::string> map;
llvm::SmallVector<std::string> argNames;
auto funcOp = dyn_cast<func::FuncOp>(op);

llvm::SmallVector<llvm::StringRef, 2> inputNames;
auto dictAttr =
funcOp->getAttrOfType<mlir::DictionaryAttr>("tf.entry_function");
if (auto str =
dictAttr.get(nameAttr).dyn_cast_or_null<mlir::StringAttr>()) {
str.getValue().split(inputNames, ',', /*MaxSplit=*/-1,
/*KeepEmpty=*/false);
}

auto argAttrs = funcOp->getAttrOfType<mlir::ArrayAttr>(argAttr);
if (argAttrs) {
for (auto attr : argAttrs) {
auto d = attr.dyn_cast_or_null<mlir::DictionaryAttr>();

const ArrayRef<Attribute> indexPathAttrs =
d.get("tf_saved_model.index_path").cast<ArrayAttr>().getValue();
auto stringAttr =
indexPathAttrs[0].dyn_cast_or_null<mlir::StringAttr>();
if (!stringAttr)
continue;
argNames.push_back(stringAttr.getValue().str());
}
} else {
for (int i = 0; i < inputNames.size(); i++) {
argNames.push_back(inputNames[i].str());
}
}

assert(argNames.size() == inputNames.size());
for (int i = 0; i < inputNames.size(); i++) {
map[inputNames[i].str()] = argNames[i];
attrsInOrder.push_back(argNames[i]);
}
return map;
};

llvm::StringMap<std::string> inNameToAttrMap, outNameToAttrMap;
llvm::SmallVector<std::string> attrsInOrder;

inNameToAttrMap = buildMap("arg_attrs", "inputs", attrsInOrder);
outNameToAttrMap = buildMap("res_attrs", "outputs", attrsInOrder);

for (int i = 0; i < inNameToAttrMap.size(); i++) {
inputTensorMap[attrsInOrder[i]] = values[i];
}

for (auto v : values) {
if (auto loc = v.getLoc()->dyn_cast_or_null<NameLoc>()) {
if (outNameToAttrMap.count(loc.getName())) {
outputTensorMap[outNameToAttrMap[loc.getName()]] = v;
}
}
}
}

std::vector<int> MemoryPlan::getAllocatedOffsets(const bool overlapOps,
int &peakMemoryUsed,
int &peakOpId) {
@@ -245,6 +311,22 @@ std::vector<int> MemoryPlan::getAllocatedOffsets(const bool overlapOps,
}
}

// Handle input/output tensors that must share the same allocation
llvm::DenseSet<Value> inputTensorSet;
llvm::DenseSet<Value> outputTensorSet;
llvm::StringMap<Value> inputTensorMap, outputTensorMap;

if (sameAllocationInputOutputTensorOption.size() > 0) {
buildInputOutputTensorMaps(inputTensorMap, outputTensorMap);
for (int i = 0; i < sameAllocationInputOutputTensorOption.size();
i = i + 2) {
inputTensorSet.insert(
inputTensorMap[sameAllocationInputOutputTensorOption[i]]);
outputTensorSet.insert(
outputTensorMap[sameAllocationInputOutputTensorOption[i + 1]]);
}
}

// The comparator keeps the buffers ordered by id if their sizes are the
// same
auto DecreasingSizesComparator = [&](QueueItem &lhs, QueueItem &rhs) {
@@ -259,23 +341,51 @@ std::vector<int> MemoryPlan::getAllocatedOffsets(const bool overlapOps,
queue(DecreasingSizesComparator);

// Insert values and their sizes into priority queue
// The inOutMap prevents adding values that are overlapped.
// In a chain of overlapped values, only the last value is allocated; the
// rest are patched up and added to the allocated values list later.
// Same-allocation input and output tensors are not inserted into the queue
// as they are allocated separately.
for (auto v : values) {
if (!inOutMap.count(v) && !vInfo[v].isConstant) {
if (!inOutMap.count(v) && !vInfo[v].isConstant &&
!outputTensorSet.contains(v) && !inputTensorSet.contains(v)) {
queue.push({v, vInfo[v].size});
}
}

ValuesOrderedByOffset allocatedValues;
auto v = queue.top().first;
queue.pop();
allocatedValues.insert({v, 0});

// If there are input/output tensor pairs that must share an allocation,
// allocate those first
if (sameAllocationInputOutputTensorOption.size() > 0) {
// Allocate the first input/output tensor pair at offset zero
allocatedValues.insert(
{inputTensorMap[sameAllocationInputOutputTensorOption[0]], 0});
allocatedValues.insert(
{outputTensorMap[sameAllocationInputOutputTensorOption[1]], 0});

for (int i = 2; i < sameAllocationInputOutputTensorOption.size();
i = i + 2) {
auto inputTensor =
inputTensorMap[sameAllocationInputOutputTensorOption[i]];
int newOffset = getOffset(inputTensor, vInfo[inputTensor].size, vInfo,
allocatedValues);
allocatedValues.insert({inputTensor, newOffset});
allocatedValues.insert(
{outputTensorMap[sameAllocationInputOutputTensorOption[i + 1]],
newOffset});
}
} else {
// Else allocate the largest tensor at offset zero
auto v = queue.top().first;
queue.pop();
allocatedValues.insert({v, 0});
}

while (!queue.empty()) {
auto v = queue.top().first;
auto size = queue.top().second;
queue.pop();

// Find an offset for v that does not clash with the already-allocated values
int newOffset = getOffset(v, size, vInfo, allocatedValues);
allocatedValues.insert({v, newOffset});
}
@@ -313,6 +423,37 @@ std::vector<int> MemoryPlan::getAllocatedOffsets(const bool overlapOps,
allocatedValuesOrderedByID.insert(i);
}

// Check if buffers clash
// for (auto i : allocatedValuesOrderedByID) {
// for (auto j : allocatedValuesOrderedByID) {
// if (vInfo[i.first].id < vInfo[j.first].id) {
// if ((vInfo[i.first].firstUsed > vInfo[j.first].firstUsed &&
// vInfo[i.first].firstUsed < vInfo[j.first].lastUsed) ||
// (vInfo[j.first].firstUsed > vInfo[i.first].firstUsed &&
// vInfo[j.first].firstUsed < vInfo[i.first].lastUsed)) {
// auto iBegin = i.second;
// auto iEnd = i.second + vInfo[i.first].size;
// auto jBegin = j.second;
// auto jEnd = j.second + vInfo[j.first].size;
// if ((iBegin > jBegin && iBegin < jEnd) ||
// (jBegin > iBegin && jBegin < iEnd)) {
// printf("\n\nProblem!");
// std::cout << "\nValue one " << vInfo[i.first].id
// << ", size = " << vInfo[i.first].size
// << ", offset = " << i.second
// << ", first = " << vInfo[i.first].firstUsed
// << ", last = " << vInfo[i.first].lastUsed;
// std::cout << "\nValue two " << vInfo[j.first].id
// << ", size = " << vInfo[j.first].size
// << ", offset = " << j.second
// << ", first = " << vInfo[j.first].firstUsed
// << ", last = " << vInfo[j.first].lastUsed;
// }
// }
// }
// }
// }

size_t peakUsed = 0;
size_t peakUsedValueID = 0;
size_t maxId = 0;
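For context on the comment added to getOffset above: placement is first-fit over the buffers already allocated, walked in increasing offset order, and buffers whose live ranges do not overlap in time are skipped because they may occupy the same memory. A self-contained sketch of that check (illustrative only; the struct and function names are assumptions, not the MemoryPlan code):

#include <algorithm>
#include <vector>

struct Buf {
  int offset = 0;
  int size = 0;
  int firstUsed = 0; // first op index that uses the buffer
  int lastUsed = 0;  // last op index that uses the buffer
};

int firstFitOffset(const Buf &v, std::vector<Buf> allocated) {
  // Walk already-placed buffers in increasing offset order.
  std::sort(allocated.begin(), allocated.end(),
            [](const Buf &a, const Buf &b) { return a.offset < b.offset; });
  int offset = 0;
  for (const auto &a : allocated) {
    // Disjoint live ranges: the two buffers may share the same memory.
    if (a.firstUsed > v.lastUsed || v.firstUsed > a.lastUsed)
      continue;
    // Clash in time: place v in the gap before this buffer if it fits,
    // otherwise continue searching past its end.
    if (a.offset - offset >= v.size)
      return offset;
    offset = std::max(offset, a.offset + a.size);
  }
  return offset;
}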
22 changes: 14 additions & 8 deletions xformer/Analysis/MemoryPlan.h
@@ -7,12 +7,21 @@
#include "mlir/Analysis/Liveness.h"
#include "mlir/IR/Value.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/StringMap.h"

#include <set>

namespace mlir {
namespace xcore {

struct ValueInfo {
size_t id;
size_t size;
bool isConstant;
int firstUsed;
int lastUsed;
};

// Represents an analysis for memory planning of a given FuncOp for a model.
// - Uses liveness analysis and a greedy algorithm to arrange buffers in memory.
// - Tries to overlap input and output buffers based on the op characteristics.
@@ -51,6 +60,11 @@ class MemoryPlan {

std::vector<Operation *> getOperationsSequence() { return operations; }

DenseMap<Value, ValueInfo> getValuesInfoMap() { return valueInfo; }

void buildInputOutputTensorMaps(llvm::StringMap<Value> &inputTensorMap,
llvm::StringMap<Value> &outputTensorMap);

// OpSplitPlan getOpSplitPlan();

void printMemoryPlan();
@@ -70,14 +84,6 @@ class MemoryPlan {
using ValuesOrderedByOffset =
std::multiset<QueueItem, IncreasingOffsetsComparator>;

struct ValueInfo {
size_t id;
size_t size;
bool isConstant;
int firstUsed;
int lastUsed;
};

int getOffset(Value v, int size, DenseMap<Value, ValueInfo> &valueInfo,
ValuesOrderedByOffset &allocatedOffsets);

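ValueInfo moves from a private member type to the public interface, and the new getValuesInfoMap() accessor exposes per-value sizes and live ranges so that other passes can inspect the plan. A hedged sketch of a possible consumer (hypothetical helper; the include path and usage are assumptions, not code from this commit):

#include <algorithm>

#include "Analysis/MemoryPlan.h" // path assumed; adjust to the build setup

// Returns the size of the largest non-constant buffer seen by the plan.
int largestNonConstantBufferSize(mlir::xcore::MemoryPlan &plan) {
  int largest = 0;
  for (auto &entry : plan.getValuesInfoMap()) {
    const auto &info = entry.second;
    if (!info.isConstant)
      largest = std::max(largest, static_cast<int>(info.size));
  }
  return largest;
}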
2 changes: 2 additions & 0 deletions xformer/Transforms/Options.h
@@ -32,6 +32,8 @@ extern llvm::cl::opt<bool> convDebugOption;
extern llvm::cl::opt<bool> overlapConvOption;
extern llvm::cl::opt<bool> offlineOffsetsOption;
extern llvm::cl::opt<unsigned> convChannelwiseSplitSizeOption;
extern llvm::cl::list<std::string> sameAllocationInputOutputTensorOption;

} // namespace xcore
} // namespace mlir

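sameAllocationInputOutputTensorOption is a string list consumed two names at a time (input, then output) in MemoryPlan.cpp above. The actual flag registration lives outside the files shown in this diff; a typical llvm::cl::list declaration would look roughly like the sketch below, where the flag string and description are placeholders, not the real option name:

#include <string>

#include "llvm/Support/CommandLine.h"

// Placeholder flag name; the real registration is not part of this diff.
llvm::cl::list<std::string> sameAllocationInputOutputTensorOption(
    "xcore-same-allocation-input-output-tensor",
    llvm::cl::desc("Alternating input/output tensor names that must be "
                   "allocated at the same offset, e.g. in1,out1,in2,out2"),
    llvm::cl::CommaSeparated);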
1 change: 1 addition & 0 deletions xformer/Transforms/Passes.cpp
@@ -15,6 +15,7 @@ void buildXCorePreOpSplitPassPipeline(OpPassManager &pm) {
// Run pass from LCE to convert Larq ops which are in TFL custom op format to
// Larq dialect
pm.addPass(mlir::TFL::CreateTranslateToLCEPass());
pm.addPass(createVerifySameAllocationTensorsPass());
// Convert dynamic shapes in batch dimension to static
pm.addPass(createRemoveDynamicShapePass());
}
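createVerifySameAllocationTensorsPass() is inserted into the pre-op-split pipeline right after the LCE translation; its implementation is in a file not shown in this diff. For orientation, a FuncOp pass created this way typically follows the standard PassWrapper skeleton (hypothetical sketch only, not the actual pass body):

#include <memory>

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Pass/Pass.h"

namespace mlir {
namespace xcore {
namespace {
struct VerifySameAllocationTensors
    : public PassWrapper<VerifySameAllocationTensors,
                         OperationPass<func::FuncOp>> {
  void runOnOperation() override {
    // e.g. verify each requested input/output tensor name exists and that the
    // paired tensors have matching sizes; call signalPassFailure() otherwise.
  }
};
} // namespace

std::unique_ptr<OperationPass<func::FuncOp>>
createVerifySameAllocationTensorsPass() {
  return std::make_unique<VerifySameAllocationTensors>();
}
} // namespace xcore
} // namespace mlir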
2 changes: 2 additions & 0 deletions xformer/Transforms/Passes.h
@@ -30,6 +30,8 @@ std::unique_ptr<OperationPass<func::FuncOp>> createReplaceFCWithConv2DPass();
std::unique_ptr<OperationPass<func::FuncOp>> createOptimizeConv2DPass();
std::unique_ptr<OperationPass<func::FuncOp>> createOpSplitPass();
std::unique_ptr<OperationPass<func::FuncOp>> createApplyTFLPatternsPass();
std::unique_ptr<OperationPass<func::FuncOp>>
createVerifySameAllocationTensorsPass();
std::unique_ptr<OperationPass<func::FuncOp>> createRemoveDynamicShapePass();
std::unique_ptr<OperationPass<func::FuncOp>> createReplaceAddSubPass();
std::unique_ptr<OperationPass<func::FuncOp>> createReplaceMulPass();
(2 remaining changed files not shown)