Merge pull request #943 from xmos/async_flash
Async flash loads
panickal-xmos authored Dec 1, 2024
2 parents f60ef11 + 5cf1f6b commit 0a63d97
Showing 10 changed files with 120 additions and 12 deletions.
11 changes: 11 additions & 0 deletions xformer/Analysis/MemoryPlan.cpp
@@ -161,6 +161,17 @@ std::vector<int> MemoryPlan::getAllocatedOffsets(const bool overlapOps,
llvm::DenseSet<Operation *> alreadyVisited;
if (overlapOps) {
for (auto o : operations) {

// For async loads, use the same buffer for load and wait
if (llvm::isa<LoadWeightsWaitOp>(o)) {
for (int i = 0; i < o->getNumOperands(); i++) {
auto inVal = o->getOperand(i);
auto outVal = o->getResult(i);
vInfo[outVal].firstUsed = vInfo[inVal].firstUsed;
inOutMap[inVal] = {outVal, 0};
}
}

// We iterate through overlappable ops which have not been visited yet
if (o->hasTrait<OpTrait::xcore::MemoryOverlappable>() &&
!alreadyVisited.contains(o)) {
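
For intuition, here is a minimal standalone sketch of the aliasing bookkeeping above (plain std::map stand-ins for the MLIR DenseMap/Value types; the ids and numbers are hypothetical): the wait op's result inherits the load result's first-use point and is mapped onto the same buffer, so the planner allocates a single region for the load/wait pair.

#include <cstdio>
#include <map>
#include <utility>

struct ValueInfo {
  int firstUsed;
};

int main() {
  std::map<int, ValueInfo> vInfo;              // value id -> liveness info
  std::map<int, std::pair<int, int>> inOutMap; // input id -> (output id, offset)

  const int loadResult = 0, waitResult = 1;
  vInfo[loadResult] = {/*firstUsed=*/5}; // async load issued at op 5

  // Mirror of the LoadWeightsWaitOp handling: the wait result inherits the
  // load result's first use and is aliased onto the load result's buffer.
  vInfo[waitResult].firstUsed = vInfo[loadResult].firstUsed;
  inOutMap[loadResult] = {waitResult, 0};

  std::printf("wait result first used at op %d\n", vInfo[waitResult].firstUsed);
  return 0;
}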
4 changes: 4 additions & 0 deletions xformer/Analysis/MemoryPlan.h
@@ -47,6 +47,10 @@ class MemoryPlan {

int getNextBottomOpId(int opId);

DenseMap<Operation *, size_t> getOperationsIDMap() { return operationIds; }

std::vector<Operation *> getOperationsSequence() { return operations; }

// OpSplitPlan getOpSplitPlan();

void printMemoryPlan();
22 changes: 20 additions & 2 deletions xformer/IR/XCoreOps.td
@@ -509,15 +509,33 @@ def XC_LoadConstantOp
let results = (outs AnyTensor : $output);
}

def XC_LoadWeights_Sync : I32EnumAttrCase<"Sync", 0>;
def XC_LoadWeights_DDR : I32EnumAttrCase<"DDR", 1>;
def XC_LoadWeights_Async : I32EnumAttrCase<"Async", 2>;
def XC_LoadWeights_OpTypeAttr
: I32EnumAttr<"LoadWeightsOpType", "op type enum", [
XC_LoadWeights_Sync, XC_LoadWeights_DDR, XC_LoadWeights_Async
]>;

def XC_LoadWeightsOp : XC_Op<"ld_weights", [Pure]> {
let summary = "Load weights op";

let description = [{Load weights op.}];

let arguments = (ins I32Attr
: $address, I32ArrayAttr
: $sizes, BoolAttr
: $in_ddr);
: $sizes, StrAttr
: $op_type);

let results = (outs Variadic<AnyTensor> : $output);
}

def XC_LoadWeightsWaitOp : XC_Op<"ld_weights_wait", [Pure]> {
let summary = "Load weights wait op";

let description = [{Load weights wait op.}];

let arguments = (ins Variadic<AnyTensor> : $input);

let results = (outs Variadic<AnyTensor> : $output);
}
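
For readers unfamiliar with I32EnumAttr: MLIR tablegen generates stringify/symbolize helpers for this enum, which the TranslateToCustomOp.cpp and WriteWeights.cpp hunks below call. A hand-written stand-in for illustration only (the real versions are generated code; only the case names and values come from the .td above):

#include <cassert>
#include <optional>
#include <string_view>

enum class LoadWeightsOpType : int { Sync = 0, DDR = 1, Async = 2 };

std::string_view stringifyLoadWeightsOpType(LoadWeightsOpType v) {
  switch (v) {
  case LoadWeightsOpType::Sync:
    return "Sync";
  case LoadWeightsOpType::DDR:
    return "DDR";
  case LoadWeightsOpType::Async:
    return "Async";
  }
  return "";
}

std::optional<LoadWeightsOpType> symbolizeLoadWeightsOpType(std::string_view s) {
  if (s == "Sync")
    return LoadWeightsOpType::Sync;
  if (s == "DDR")
    return LoadWeightsOpType::DDR;
  if (s == "Async")
    return LoadWeightsOpType::Async;
  return std::nullopt;
}

int main() {
  assert(stringifyLoadWeightsOpType(LoadWeightsOpType::Async) == "Async");
  assert(symbolizeLoadWeightsOpType("DDR") == LoadWeightsOpType::DDR);
  return 0;
}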
2 changes: 1 addition & 1 deletion xformer/Transforms/OptimizeConv2D.cpp
@@ -86,7 +86,7 @@ struct ChannelwiseSplitConv2DOutputPattern
return splitResultType;
};

if(!llvm::isa<TFL::QConstOp>(op.getFilter().getDefiningOp()))
if (!llvm::isa<TFL::QConstOp>(op.getFilter().getDefiningOp()))
return failure();

auto filterQConstOp =
1 change: 1 addition & 0 deletions xformer/Transforms/Options.h
@@ -15,6 +15,7 @@ extern llvm::cl::opt<unsigned> threadCountOption;
extern llvm::cl::opt<std::string> weightsFilenameOption;
extern llvm::cl::opt<unsigned> loadExternallyIfLargerOption;
extern llvm::cl::opt<bool> weightsAsArrayOption;
extern llvm::cl::opt<bool> asyncLoadWeightsOption;
extern llvm::cl::opt<bool> weightsInExternalMemory;
extern llvm::cl::opt<unsigned> maxLoadExternalSizeOption;
extern llvm::cl::opt<double> convQuantErrorThresholdOption;
8 changes: 5 additions & 3 deletions xformer/Transforms/TranslateToCustomOp.cpp
@@ -15,6 +15,7 @@ namespace mlir::xcore {
std::vector<uint8_t> Expand8To16Op::buildCustomOptions() { return {}; }
std::vector<uint8_t> FakeScratchBufferOp::buildCustomOptions() { return {}; }
std::vector<uint8_t> Bsign8Op::buildCustomOptions() { return {}; }
std::vector<uint8_t> LoadWeightsWaitOp::buildCustomOptions() { return {}; }

std::vector<uint8_t> UnaryI16Op::buildCustomOptions() {
flexbuffers::Builder fbb;
@@ -153,13 +154,13 @@ std::vector<uint8_t> ConcatOp::buildCustomOptions() {
std::vector<uint8_t> LoadWeightsOp::buildCustomOptions() {
flexbuffers::Builder fbb;
auto rootMap = fbb.StartMap();
fbb.Int("addr", (int32_t)getAddress());
auto sizesVec = fbb.StartVector("sizes");
fbb.Int("a", (int32_t)getAddress());
auto sizesVec = fbb.StartVector("s");
for (int i = 0; i < getSizes().cast<ArrayAttr>().size(); ++i) {
fbb.Int(getSizes().cast<ArrayAttr>()[i].cast<IntegerAttr>().getInt());
}
fbb.EndVector(sizesVec, false, false);
fbb.Bool("ddr", (bool)getInDdr());
fbb.Int("t", (int32_t)(symbolizeLoadWeightsOpType(getOpType()).value()));
fbb.EndMap(rootMap);
fbb.Finish();
return fbb.GetBuffer();
@@ -296,6 +297,7 @@ void TranslateToCustomOp::runOnOperation() {
patterns.insert<RewriteToCustomOp<FakeScratchBufferOp>>(ctx);
patterns.insert<RewriteToCustomOp<FakeSliceOp>>(ctx);
patterns.insert<RewriteToCustomOp<Expand8To16Op>>(ctx);
patterns.insert<RewriteToCustomOp<LoadWeightsWaitOp>>(ctx);

(void)applyPatternsAndFoldGreedily(func, std::move(patterns));
}
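
On the consuming side, a kernel can decode the options written above with the flexbuffers reader API. A hedged sketch (the struct and function names here are made up; only the "a"/"s"/"t" keys and the enum values are taken from this diff):

#include <cstddef>
#include <cstdint>
#include <vector>

#include "flatbuffers/flexbuffers.h"

// Decoded form of LoadWeightsOp::buildCustomOptions() output.
struct LoadWeightsParams {
  int32_t address;            // "a"
  std::vector<int32_t> sizes; // "s"
  int32_t opType;             // "t": 0 = Sync, 1 = DDR, 2 = Async
};

LoadWeightsParams parseLoadWeightsOptions(const uint8_t *data, size_t len) {
  LoadWeightsParams p;
  auto map = flexbuffers::GetRoot(data, len).AsMap();
  p.address = map["a"].AsInt32();
  auto sizes = map["s"].AsVector();
  for (size_t i = 0; i < sizes.size(); ++i)
    p.sizes.push_back(sizes[i].AsInt32());
  p.opType = map["t"].AsInt32();
  return p;
}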
64 changes: 62 additions & 2 deletions xformer/Transforms/WriteWeights.cpp
@@ -1,6 +1,7 @@
// Copyright 2021 XMOS LIMITED. This Software is subject to the terms of the
// XMOS Public License: Version 1

#include "Analysis/MemoryPlan.h"
#include "IR/XCoreOps.h"
#include "Transforms/Options.h"
#include "Utils/FileIO.h"
@@ -70,6 +71,7 @@ struct WriteWeightsPattern : public OpRewritePattern<LoadConstantOp> {
// load is not from external memory.
// External memory loads have to be aligned to 32 bytes/256 bits for max
// speed
LoadWeightsOpType opType = LoadWeightsOpType::Sync;
if (loadOp.getResult().hasOneUse() && !weightsInExternalMemory) {
auto use = loadOp->use_begin();
Operation *ownerOp = use->getOwner();
@@ -93,7 +95,7 @@ struct WriteWeightsPattern : public OpRewritePattern<LoadConstantOp> {

auto loadWeightsOp = rewriter.create<LoadWeightsOp>(
loadOp.getLoc(), outputTypes, address,
rewriter.getArrayAttr(dataSizes), /*in_ddr=*/false);
rewriter.getArrayAttr(dataSizes), stringifyLoadWeightsOpType(opType));

for (int i = 0; i < opNums.size(); i++) {
ownerOp->setOperand(opNums[i], loadWeightsOp.getResult(i));
@@ -111,10 +113,11 @@ struct WriteWeightsPattern : public OpRewritePattern<LoadConstantOp> {
auto toBePaddedSize = alignedSize - loadOpData.size();
// Pad with zeros
tensorData.insert(tensorData.end(), toBePaddedSize, 0);
opType = LoadWeightsOpType::DDR;
}
auto loadWeightsOp = rewriter.create<LoadWeightsOp>(
loadOp.getLoc(), loadOp.getType(), address,
rewriter.getArrayAttr(dataSizes), /*in_ddr=*/weightsInExternalMemory);
rewriter.getArrayAttr(dataSizes), stringifyLoadWeightsOpType(opType));
rewriter.replaceOp(loadOp, loadWeightsOp.getOutput());

// Find all uses of loadWeightsOp and find the first Owner op
@@ -139,6 +142,34 @@ struct WriteWeightsPattern : public OpRewritePattern<LoadConstantOp> {
std::vector<std::vector<char>> *tensorsVec_;
};

struct LowerToAsyncLoadsPattern : public OpRewritePattern<LoadWeightsOp> {
LowerToAsyncLoadsPattern(MLIRContext *context)
: OpRewritePattern<LoadWeightsOp>(context) {}

LogicalResult matchAndRewrite(LoadWeightsOp loadWeightsOp,
PatternRewriter &rewriter) const override {
if (loadWeightsOp.getOpType() !=
stringifyLoadWeightsOpType(LoadWeightsOpType::Sync)) {
return failure();
}

// We use loadWeightsOp.getResultTypes() as the load weights op can have a
// variadic number of results
auto loadWeightsAsyncOp = rewriter.create<LoadWeightsOp>(
loadWeightsOp.getLoc(), loadWeightsOp.getResultTypes(),
loadWeightsOp.getAddress(), loadWeightsOp.getSizes(),
stringifyLoadWeightsOpType(LoadWeightsOpType::Async));

auto loadWeightsWaitOp = rewriter.create<LoadWeightsWaitOp>(
loadWeightsAsyncOp.getLoc(), loadWeightsAsyncOp.getResultTypes(),
loadWeightsAsyncOp.getResults());

rewriter.replaceOp(loadWeightsOp, loadWeightsWaitOp.getOutput());

return success();
}
};
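
Note the match condition above: only loads tagged Sync are split into an Async load plus a wait, so DDR loads (external memory) stay synchronous, consistent with the flag description in XCoreOptMain.cpp. And because the replacement load is tagged Async, it no longer matches the pattern, so the greedy rewrite driver terminates.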

void WriteWeights::runOnOperation() {
func::FuncOp f = getOperation();
if (weightsFilenameOption.empty()) {
@@ -154,8 +185,37 @@
std::vector<std::vector<char>> tensorsVec;
RewritePatternSet patterns(ctx);
patterns.insert<WriteWeightsPattern>(&tensorsVec, ctx);
if (asyncLoadWeightsOption) {
patterns.insert<LowerToAsyncLoadsPattern>(ctx);
}
(void)applyPatternsAndFoldGreedily(func, std::move(patterns));

// Reorder async loads to be before the previous convolution
// so that compute can be overlapped with the load
auto &m = getAnalysis<MemoryPlan>();
auto opIdMap = m.getOperationsIDMap();
auto ops = m.getOperationsSequence();

llvm::SetVector<int> convOpIds;
for (auto o : ops) {
if (llvm::isa<Conv2DV2Op>(o)) {
convOpIds.insert(opIdMap[o]);
}
}

for (auto o : ops) {
if (llvm::isa<LoadWeightsOp>(o)) {
auto ldOp = dyn_cast<LoadWeightsOp>(o);
if (ldOp.getOpType() ==
stringifyLoadWeightsOpType(LoadWeightsOpType::Async)) {
int idx = llvm::lower_bound(convOpIds, opIdMap[o]) - convOpIds.begin();
if (idx > 0) {
o->moveBefore(ops[convOpIds[idx - 1]]);
}
}
}
}
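
As a worked example of the placement logic: with convolutions at op ids {3, 7, 12} and an async load at id 9, llvm::lower_bound returns idx = 2, so the load is moved before the convolution at id 7 and the flash read overlaps that convolution's compute; a load already ahead of the first convolution (idx == 0) is left in place.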

if (failed(utils::writeWeightsToFile(weightsFilenameOption, tensorsVec,
weightsAsArrayOption,
weightsInExternalMemory))) {
16 changes: 14 additions & 2 deletions xformer/XCoreOptMain.cpp
@@ -85,6 +85,12 @@ cl::opt<bool> weightsInExternalMemory(
"it in external memory."),
cl::init(false), cl::cat(XformerCategory));

cl::opt<bool> asyncLoadWeightsOption(
"xcore-async-load-weights",
cl::desc("Enable loading weights from flash asynchronously. This does not "
"affect loads from external memory."),
cl::init(false), cl::cat(XformerCategory));
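
Async flash loads are therefore opt-in; presumably something along the lines of the following enables them (the binary name and other flags are assumptions, the flag spelling is from the definition above):

xcore-opt --xcore-async-load-weights <other options> model.tflite

The check added further down rejects combining this flag with xcore-weights-in-external-memory.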

cl::opt<unsigned> loadExternallyIfLargerOption(
"xcore-load-externally-if-larger",
cl::desc("Load constants externally if larger than given limit in bytes "
@@ -470,8 +476,14 @@ int main(int argc, char **argv) {
if (mlir::xcore::weightsInExternalMemory.getNumOccurrences() > 0 &&
mlir::xcore::weightsAsArrayOption.getNumOccurrences() == 0) {
return failedMessage(
"Please specify the xcore-write-weights-as-array"
"when using the xcore-weights-in-external-memory option!");
"Please specify xcore-write-weights-as-array"
" when using the xcore-weights-in-external-memory option!");
}

if (mlir::xcore::weightsInExternalMemory.getNumOccurrences() > 0 &&
mlir::xcore::asyncLoadWeightsOption.getNumOccurrences() > 0) {
return failedMessage("Please don't specify xcore-weights-in-external-memory"
" when using the xcore-async-load-weights option!");
}

if (mlir::xcore::loadExternallyIfLargerOption.getNumOccurrences() > 0 &&
2 changes: 1 addition & 1 deletion xformer/lib_tflite_micro.BUILD
@@ -30,7 +30,7 @@ filegroup(
"lib_tflite_micro/src/tflite-xcore-kernels/xcore_conv2d_v2.cc",
"lib_tflite_micro/src/tflite-xcore-kernels/xcore_maxpool2d.cc",
"lib_tflite_micro/src/tflite-xcore-kernels/xcore_detection_post.cc",
"lib_tflite_micro/src/tflite-xcore-kernels/xcore_load_weights.cc",
"lib_tflite_micro/src/tflite-xcore-kernels/xcore_load_weights_wait.cc",
"lib_tflite_micro/src/tflite-xcore-kernels/xcore_lookup.cc",
"lib_tflite_micro/src/tflite-xcore-kernels/xcore_softmax.cc",
"lib_tflite_micro/src/tflite-xcore-kernels/xcore_batched_softmax.cc",
