diff --git a/third_party/lib_tflite_micro b/third_party/lib_tflite_micro
index 6a387b04c..a0e68eea3 160000
--- a/third_party/lib_tflite_micro
+++ b/third_party/lib_tflite_micro
@@ -1 +1 @@
-Subproject commit 6a387b04c20602383ab9af903de5092290d70091
+Subproject commit a0e68eea3892c00afe510bd8de54ff45d9eba53b
diff --git a/xformer/Analysis/MemoryPlan.cpp b/xformer/Analysis/MemoryPlan.cpp
index 9a72a8094..96236b942 100644
--- a/xformer/Analysis/MemoryPlan.cpp
+++ b/xformer/Analysis/MemoryPlan.cpp
@@ -129,7 +129,9 @@ int MemoryPlan::getOffset(Value v, int size,
     if ((valueInfo[allocatedVal].firstUsed > valueInfo[v].lastUsed) ||
         (valueInfo[v].firstUsed > valueInfo[allocatedVal].lastUsed)) {
-      // No overlap
+      // There is no overlap with this buffer; move on until we find a clash.
+      // When we do find a clash, we can allocate just before it if there is
+      // space, since we do not overlap with any of the buffers skipped so far.
       continue;
     }
@@ -149,6 +151,70 @@ int MemoryPlan::getOffset(Value v, int size,
   return offset;
 }
 
+void MemoryPlan::buildInputOutputTensorMaps(
+    llvm::StringMap<Value> &inputTensorMap,
+    llvm::StringMap<Value> &outputTensorMap) {
+  auto buildMap = [&](StringRef argAttr, StringRef nameAttr,
+                      llvm::SmallVector<std::string> &attrsInOrder) {
+    llvm::StringMap<std::string> map;
+    llvm::SmallVector<std::string> argNames;
+    auto funcOp = dyn_cast<func::FuncOp>(op);
+
+    llvm::SmallVector<StringRef> inputNames;
+    auto dictAttr =
+        funcOp->getAttrOfType<mlir::DictionaryAttr>("tf.entry_function");
+    if (auto str =
+            dictAttr.get(nameAttr).dyn_cast_or_null<mlir::StringAttr>()) {
+      str.getValue().split(inputNames, ',', /*MaxSplit=*/-1,
+                           /*KeepEmpty=*/false);
+    }
+
+    auto argAttrs = funcOp->getAttrOfType<mlir::ArrayAttr>(argAttr);
+    if (argAttrs) {
+      for (auto attr : argAttrs) {
+        auto d = attr.dyn_cast_or_null<mlir::DictionaryAttr>();
+
+        const ArrayRef<Attribute> indexPathAttrs =
+            d.get("tf_saved_model.index_path").cast<mlir::ArrayAttr>().getValue();
+        auto stringAttr =
+            indexPathAttrs[0].dyn_cast_or_null<mlir::StringAttr>();
+        if (!stringAttr)
+          continue;
+        argNames.push_back(stringAttr.getValue().str());
+      }
+    } else {
+      for (int i = 0; i < inputNames.size(); i++) {
+        argNames.push_back(inputNames[i].str());
+      }
+    }
+
+    assert(argNames.size() == inputNames.size());
+    for (int i = 0; i < inputNames.size(); i++) {
+      map[inputNames[i].str()] = argNames[i];
+      attrsInOrder.push_back(argNames[i]);
+    }
+    return map;
+  };
+
+  llvm::StringMap<std::string> inNameToAttrMap, outNameToAttrMap;
+  llvm::SmallVector<std::string> attrsInOrder;
+
+  inNameToAttrMap = buildMap("arg_attrs", "inputs", attrsInOrder);
+  outNameToAttrMap = buildMap("res_attrs", "outputs", attrsInOrder);
+
+  for (int i = 0; i < inNameToAttrMap.size(); i++) {
+    inputTensorMap[attrsInOrder[i]] = values[i];
+  }
+
+  for (auto v : values) {
+    if (auto loc = v.getLoc()->dyn_cast_or_null<NameLoc>()) {
+      if (outNameToAttrMap.count(loc.getName())) {
+        outputTensorMap[outNameToAttrMap[loc.getName()]] = v;
+      }
+    }
+  }
+}
+
 std::vector<int> MemoryPlan::getAllocatedOffsets(const bool overlapOps,
                                                  int &peakMemoryUsed,
                                                  int &peakOpId) {
@@ -245,6 +311,22 @@ std::vector MemoryPlan::getAllocatedOffsets(const bool overlapOps,
     }
   }
 
+  // Handle same-allocation input and output tensors
+  llvm::DenseSet<Value> inputTensorSet;
+  llvm::DenseSet<Value> outputTensorSet;
+  llvm::StringMap<Value> inputTensorMap, outputTensorMap;
+
+  if (sameAllocationInputOutputTensorOption.size() > 0) {
+    buildInputOutputTensorMaps(inputTensorMap, outputTensorMap);
+    for (int i = 0; i < sameAllocationInputOutputTensorOption.size();
+         i = i + 2) {
+      inputTensorSet.insert(
+          inputTensorMap[sameAllocationInputOutputTensorOption[i]]);
+      outputTensorSet.insert(
+          outputTensorMap[sameAllocationInputOutputTensorOption[i + 1]]);
+    }
+  }
+
   // The comparator keeps the buffers ordered by id if their sizes are the
   // same
   auto DecreasingSizesComparator = [&](QueueItem &lhs, QueueItem &rhs) {
@@ -259,23 +341,51 @@ std::vector MemoryPlan::getAllocatedOffsets(const bool overlapOps,
       queue(DecreasingSizesComparator);
 
   // Insert values and their sizes into priority queue
+  // inOutMap keeps out values that are overlapped; in a chain of overlapped
+  // values, only the last value is allocated here and the rest are patched
+  // up and added to the allocated values list later.
+  // Same-allocation input and output tensors are also not inserted into the
+  // queue, since they are allocated separately below.
   for (auto v : values) {
-    if (!inOutMap.count(v) && !vInfo[v].isConstant) {
+    if (!inOutMap.count(v) && !vInfo[v].isConstant &&
+        !outputTensorSet.contains(v) && !inputTensorSet.contains(v)) {
       queue.push({v, vInfo[v].size});
     }
   }
 
   ValuesOrderedByOffset allocatedValues;
-  auto v = queue.top().first;
-  queue.pop();
-  allocatedValues.insert({v, 0});
+
+  // If there are same-allocation input and output tensors, allocate those
+  // first.
+  if (sameAllocationInputOutputTensorOption.size() > 0) {
+    // Allocate the first input and output tensor pair at offset zero.
+    allocatedValues.insert(
+        {inputTensorMap[sameAllocationInputOutputTensorOption[0]], 0});
+    allocatedValues.insert(
+        {outputTensorMap[sameAllocationInputOutputTensorOption[1]], 0});
+
+    for (int i = 2; i < sameAllocationInputOutputTensorOption.size();
+         i = i + 2) {
+      auto inputTensor =
+          inputTensorMap[sameAllocationInputOutputTensorOption[i]];
+      int newOffset = getOffset(inputTensor, vInfo[inputTensor].size, vInfo,
+                                allocatedValues);
+      allocatedValues.insert({inputTensor, newOffset});
+      allocatedValues.insert(
+          {outputTensorMap[sameAllocationInputOutputTensorOption[i + 1]],
+           newOffset});
+    }
+  } else {
+    // Otherwise allocate the largest tensor at offset zero.
+    auto v = queue.top().first;
+    queue.pop();
+    allocatedValues.insert({v, 0});
+  }
 
   while (!queue.empty()) {
     auto v = queue.top().first;
     auto size = queue.top().second;
     queue.pop();
-    // check with allocatedValues list
     int newOffset = getOffset(v, size, vInfo, allocatedValues);
     allocatedValues.insert({v, newOffset});
   }
@@ -313,6 +423,37 @@ std::vector MemoryPlan::getAllocatedOffsets(const bool overlapOps,
     }
     allocatedValuesOrderedByID.insert(i);
   }
+  // Check if buffers clash
+  // for (auto i : allocatedValuesOrderedByID) {
+  //   for (auto j : allocatedValuesOrderedByID) {
+  //     if (vInfo[i.first].id < vInfo[j.first].id) {
+  //       if ((vInfo[i.first].firstUsed > vInfo[j.first].firstUsed &&
+  //            vInfo[i.first].firstUsed < vInfo[j.first].lastUsed) ||
+  //           (vInfo[j.first].firstUsed > vInfo[i.first].firstUsed &&
+  //            vInfo[j.first].firstUsed < vInfo[i.first].lastUsed)) {
+  //         auto iBegin = i.second;
+  //         auto iEnd = i.second + vInfo[i.first].size;
+  //         auto jBegin = j.second;
+  //         auto jEnd = j.second + vInfo[j.first].size;
+  //         if ((iBegin > jBegin && iBegin < jEnd) ||
+  //             (jBegin > iBegin && jBegin < iEnd)) {
+  //           printf("\n\nProblem!");
+  //           std::cout << "\nValue one " << vInfo[i.first].id
+  //                     << ", size = " << vInfo[i.first].size
+  //                     << ", offset = " << i.second
+  //                     << ", first = " << vInfo[i.first].firstUsed
+  //                     << ", last = " << vInfo[i.first].lastUsed;
+  //           std::cout << "\nValue two " << vInfo[j.first].id
+  //                     << ", size = " << vInfo[j.first].size
+  //                     << ", offset = " << j.second
+  //                     << ", first = " << vInfo[j.first].firstUsed
+  //                     << ", last = " << vInfo[j.first].lastUsed;
+  //         }
+  //       }
+  //     }
+  //   }
+  // }
+
   size_t peakUsed = 0;
   size_t peakUsedValueID = 0;
  size_t maxId = 0;
diff --git a/xformer/Analysis/MemoryPlan.h b/xformer/Analysis/MemoryPlan.h
index 49c463145..fde7f3248 100644
--- a/xformer/Analysis/MemoryPlan.h
+++ b/xformer/Analysis/MemoryPlan.h
@@ -7,12 +7,21 @@
 #include "mlir/Analysis/Liveness.h"
 #include "mlir/IR/Value.h"
 #include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/StringMap.h"
 #include
 
 namespace mlir {
 namespace xcore {
 
+struct ValueInfo {
+  size_t id;
+  size_t size;
+  bool isConstant;
+  int firstUsed;
+  int lastUsed;
+};
+
 // Represents an analysis for memory planning of a given FuncOp for a model.
 // - Uses liveness analysis and a greedy algorithm to arrange buffers in memory.
 // - Tries to overlap input and output buffers based on the op characteristics.
@@ -51,6 +60,11 @@ class MemoryPlan {
   std::vector<Operation *> getOperationsSequence() { return operations; }
 
+  DenseMap<Value, ValueInfo> getValuesInfoMap() { return valueInfo; }
+
+  void buildInputOutputTensorMaps(llvm::StringMap<Value> &inputTensorMap,
+                                  llvm::StringMap<Value> &outputTensorMap);
+
   // OpSplitPlan getOpSplitPlan();
 
   void printMemoryPlan();
@@ -70,14 +84,6 @@ class MemoryPlan {
   using ValuesOrderedByOffset = std::multiset;
 
-  struct ValueInfo {
-    size_t id;
-    size_t size;
-    bool isConstant;
-    int firstUsed;
-    int lastUsed;
-  };
-
   int getOffset(Value v, int size, DenseMap<Value, ValueInfo> &valueInfo,
                 ValuesOrderedByOffset &allocatedOffsets);
diff --git a/xformer/Transforms/Options.h b/xformer/Transforms/Options.h
index b200309c5..046e7e32c 100644
--- a/xformer/Transforms/Options.h
+++ b/xformer/Transforms/Options.h
@@ -32,6 +32,8 @@ extern llvm::cl::opt convDebugOption;
 extern llvm::cl::opt overlapConvOption;
 extern llvm::cl::opt offlineOffsetsOption;
 extern llvm::cl::opt convChannelwiseSplitSizeOption;
+extern llvm::cl::list<std::string> sameAllocationInputOutputTensorOption;
+
 } // namespace xcore
 } // namespace mlir
diff --git a/xformer/Transforms/Passes.cpp b/xformer/Transforms/Passes.cpp
index 6acce3f26..11aff6de7 100644
--- a/xformer/Transforms/Passes.cpp
+++ b/xformer/Transforms/Passes.cpp
@@ -15,6 +15,7 @@ void buildXCorePreOpSplitPassPipeline(OpPassManager &pm) {
   // Run pass from LCE to convert Larq ops which are in TFL custom op format to
   // Larq dialect
   pm.addPass(mlir::TFL::CreateTranslateToLCEPass());
+  pm.addPass(createVerifySameAllocationTensorsPass());
   // Convert dynamic shapes in batch dimension to static
   pm.addPass(createRemoveDynamicShapePass());
 }
diff --git a/xformer/Transforms/Passes.h b/xformer/Transforms/Passes.h
index dfdc15a7e..95eb57dda 100644
--- a/xformer/Transforms/Passes.h
+++ b/xformer/Transforms/Passes.h
@@ -30,6 +30,8 @@ std::unique_ptr> createReplaceFCWithConv2DPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createOptimizeConv2DPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createOpSplitPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createApplyTFLPatternsPass();
+std::unique_ptr<OperationPass<func::FuncOp>>
+createVerifySameAllocationTensorsPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createRemoveDynamicShapePass();
 std::unique_ptr<OperationPass<func::FuncOp>> createReplaceAddSubPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createReplaceMulPass();
diff --git a/xformer/Transforms/VerifySameAllocationTensors.cpp b/xformer/Transforms/VerifySameAllocationTensors.cpp
new file mode 100644
index 000000000..7f9f0a424
--- /dev/null
+++ b/xformer/Transforms/VerifySameAllocationTensors.cpp
@@ -0,0 +1,166 @@
+// Copyright 2021 XMOS LIMITED. This Software is subject to the terms of the
+// XMOS Public License: Version 1
+
+#include "Analysis/MemoryPlan.h"
+#include "IR/XCoreOps.h"
+#include "Transforms/Options.h"
+#include "Utils/Util.h"
+
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
+
+namespace mlir::xcore {
+
+namespace {
+struct VerifySameAllocationTensors
+    : public PassWrapper<VerifySameAllocationTensors,
+                         OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(VerifySameAllocationTensors)
+
+  void getDependentDialects(DialectRegistry &registry) const final {
+    registry.insert();
+  }
+  StringRef getArgument() const final { return "xcore-preset-allocations"; }
+  StringRef getDescription() const final {
+    return "Verify tensors specified for same allocation";
+  }
+  void runOnOperation() override;
+};
+
+void VerifySameAllocationTensors::runOnOperation() {
+  auto func = getOperation();
+  auto *ctx = &getContext();
+
+  if (sameAllocationInputOutputTensorOption.size() > 0) {
+
+    auto &m = getAnalysis<MemoryPlan>();
+    llvm::StringMap<Value> inputTensorMap, outputTensorMap;
+    m.buildInputOutputTensorMaps(inputTensorMap, outputTensorMap);
+
+    bool failed = false;
+    // Check names of input and output tensors
+    for (int i = 0; i < sameAllocationInputOutputTensorOption.size();
+         i = i + 2) {
+      if (!inputTensorMap.count(sameAllocationInputOutputTensorOption[i])) {
+        func.emitError()
+            << sameAllocationInputOutputTensorOption[i]
+            << " is not present in the input tensors. Please check the name!";
+        failed = true;
+      }
+      if (!outputTensorMap.count(
+              sameAllocationInputOutputTensorOption[i + 1])) {
+        func.emitError()
+            << sameAllocationInputOutputTensorOption[i + 1]
+            << " is not present in the output tensors. Please check the name!";
+        failed = true;
+      }
+    }
+
+    if (failed) {
+      signalPassFailure();
+      return;
+    }
+
+    // Check sizes
+    auto vInfo = m.getValuesInfoMap();
+    for (int i = 0; i < sameAllocationInputOutputTensorOption.size();
+         i = i + 2) {
+      if (vInfo[inputTensorMap[sameAllocationInputOutputTensorOption[i]]]
+              .size !=
+          vInfo[outputTensorMap[sameAllocationInputOutputTensorOption[i + 1]]]
+              .size) {
+        func.emitError() << "Size of input tensor "
+                         << sameAllocationInputOutputTensorOption[i]
+                         << " is not equal to that of output tensor "
+                         << sameAllocationInputOutputTensorOption[i + 1]
+                         << ". Please check!";
+        failed = true;
+      }
+    }
+
+    // Check quantization
+    for (int i = 0; i < sameAllocationInputOutputTensorOption.size();
+         i = i + 2) {
+      auto inQType = dyn_cast_or_null<mlir::quant::UniformQuantizedType>(
+          inputTensorMap[sameAllocationInputOutputTensorOption[i]]
+              .getType()
+              .cast<ShapedType>()
+              .getElementType());
+      auto outQType = dyn_cast_or_null<mlir::quant::UniformQuantizedType>(
+          outputTensorMap[sameAllocationInputOutputTensorOption[i + 1]]
+              .getType()
+              .cast<ShapedType>()
+              .getElementType());
+      if (inQType && !outQType) {
+        func.emitError() << "Input tensor "
+                         << sameAllocationInputOutputTensorOption[i]
+                         << " is quantized, but "
+                         << sameAllocationInputOutputTensorOption[i + 1]
+                         << " is not. Please check!";
+        failed = true;
+      } else if (!inQType && outQType) {
+        func.emitError() << "Input tensor "
+                         << sameAllocationInputOutputTensorOption[i]
+                         << " is not quantized, but "
+                         << sameAllocationInputOutputTensorOption[i + 1]
+                         << " is quantized. Please check!";
Please check!"; + failed = true; + } else if (inQType && outQType) { + // Both are quantized, but check element sizes, maybe i8 and i16 + + auto inScale = inQType.getScale(); + auto inZeroPoint = inQType.getZeroPoint(); + + auto outScale = outQType.getScale(); + auto outZeroPoint = outQType.getZeroPoint(); + if (inScale != outScale || inZeroPoint != outZeroPoint) { + // change input block arg to output quantization + + // insert quantize op to convert back to original input quantization + // auto module = func->getParentOfType(); + // OpBuilder builder(module); + // auto outVal = + // outputTensorMap[sameAllocationInputOutputTensorOption[i + // + 1]]; auto newQType = inQType.castFromExpressedType( + // quant::QuantizedType::castToExpressedType(outVal.getType())); + // auto newQuantizeOp = builder.create( + // inVal.getLoc(), newQType, outVal, TypeAttr::get(inQType)); + + auto inVal = inputTensorMap[sameAllocationInputOutputTensorOption[i]]; + auto typeNumBits = + utils::getTypeSize( + inVal.getType().cast().getElementType()) * + 8; + double maxError = 1.0 / (2 << (typeNumBits - 1)); + if (abs(inScale - outScale) > maxError) { + func.emitError() + << "Input tensor " << sameAllocationInputOutputTensorOption[i] + << " has scale of " << inScale << " and zeropoint of " + << inZeroPoint << ", but output tensor " + << sameAllocationInputOutputTensorOption[i + 1] + << " has scale of " << outScale << " and zeropoint of " + << outZeroPoint << ". Please check!"; + failed = true; + } + } + } else if (!inQType && !outQType) { + // Both are not quantized, but check element sizes, maybe i8 and i16 + } + } + + if (failed) { + signalPassFailure(); + return; + } + } +} +} // namespace + +// Creates an instance of the VerifySameAllocationTensors pass. +std::unique_ptr> +createVerifySameAllocationTensorsPass() { + return std::make_unique(); +} + +static PassRegistration pass; + +} // namespace mlir::xcore diff --git a/xformer/XCoreOptMain.cpp b/xformer/XCoreOptMain.cpp index 63310d300..26cadeaba 100644 --- a/xformer/XCoreOptMain.cpp +++ b/xformer/XCoreOptMain.cpp @@ -36,6 +36,13 @@ namespace mlir::xcore { // and -help) will be hidden. static cl::OptionCategory XformerCategory("Xformer options"); +llvm::cl::list sameAllocationInputOutputTensorOption( + "xcore-same-allocation-input-output-tensor", + cl::desc("Allocate this input and output tensor in the same memory " + "location. This helps avoiding a memcopy from output to input in " + "case of recurrent networks. The first tensor must be the input."), + cl::CommaSeparated, cl::cat(XformerCategory)); + cl::opt enableMemoryAnalysisOption( "xcore-run-memory-analysis", cl::desc("Run memory analysis to aid in operation splitting."), @@ -507,6 +514,24 @@ int main(int argc, char **argv) { return failedMessage("Please specify a thread count between one and five!"); } + llvm::DenseMap positionCountMap; + for (int i = 0; i < mlir::xcore::sameAllocationInputOutputTensorOption.size(); + i++) { + int pos = mlir::xcore::sameAllocationInputOutputTensorOption.getPosition(i); + if (positionCountMap.count(pos)) { + positionCountMap[pos]++; + } else { + positionCountMap[pos] = 1; + } + } + for (auto i : positionCountMap) { + if (i.second != 2) { + return failedMessage( + "Please specify two tensors, an input tensor and output tensor for " + "each of xcore-same-allocation-input-output-tensor options!"); + } + } + if (failed(isCompatibleVersion( versionLibTfliteMicro, lib_tflite_micro::major_version, lib_tflite_micro::minor_version, lib_tflite_micro::patch_version))) {