diff --git a/third_party/lib_tflite_micro b/third_party/lib_tflite_micro
index 6a387b04c..a0e68eea3 160000
--- a/third_party/lib_tflite_micro
+++ b/third_party/lib_tflite_micro
@@ -1 +1 @@
-Subproject commit 6a387b04c20602383ab9af903de5092290d70091
+Subproject commit a0e68eea3892c00afe510bd8de54ff45d9eba53b
diff --git a/xformer/Analysis/MemoryPlan.cpp b/xformer/Analysis/MemoryPlan.cpp
index 9a72a8094..96236b942 100644
--- a/xformer/Analysis/MemoryPlan.cpp
+++ b/xformer/Analysis/MemoryPlan.cpp
@@ -129,7 +129,9 @@ int MemoryPlan::getOffset(Value v, int size,
     if ((valueInfo[allocatedVal].firstUsed > valueInfo[v].lastUsed) ||
         (valueInfo[v].firstUsed > valueInfo[allocatedVal].lastUsed)) {
-      // No overlap
+      // There is no overlap with this buffer; move on until we find a clash.
+      // When we do find a clash, we can allocate just before it if there is
+      // space, since we do not overlap with any of the buffers skipped so far.
       continue;
     }
@@ -149,6 +151,70 @@ int MemoryPlan::getOffset(Value v, int size,
   return offset;
 }
 
+void MemoryPlan::buildInputOutputTensorMaps(
+    llvm::StringMap<Value> &inputTensorMap,
+    llvm::StringMap<Value> &outputTensorMap) {
+  auto buildMap = [&](StringRef argAttr, StringRef nameAttr,
+                      llvm::SmallVector<std::string> &attrsInOrder) {
+    llvm::StringMap<std::string> map;
+    llvm::SmallVector<std::string> argNames;
+    auto funcOp = dyn_cast<func::FuncOp>(op);
+
+    llvm::SmallVector<StringRef> inputNames;
+    auto dictAttr =
+        funcOp->getAttrOfType<mlir::DictionaryAttr>("tf.entry_function");
+    if (auto str =
+            dictAttr.get(nameAttr).dyn_cast_or_null<mlir::StringAttr>()) {
+      str.getValue().split(inputNames, ',', /*MaxSplit=*/-1,
+                           /*KeepEmpty=*/false);
+    }
+
+    auto argAttrs = funcOp->getAttrOfType<mlir::ArrayAttr>(argAttr);
+    if (argAttrs) {
+      for (auto attr : argAttrs) {
+        auto d = attr.dyn_cast_or_null<mlir::DictionaryAttr>();
+
+        const ArrayRef<Attribute> indexPathAttrs =
+            d.get("tf_saved_model.index_path").cast<mlir::ArrayAttr>().getValue();
+        auto stringAttr =
+            indexPathAttrs[0].dyn_cast_or_null<mlir::StringAttr>();
+        if (!stringAttr)
+          continue;
+        argNames.push_back(stringAttr.getValue().str());
+      }
+    } else {
+      for (int i = 0; i < inputNames.size(); i++) {
+        argNames.push_back(inputNames[i].str());
+      }
+    }
+
+    assert(argNames.size() == inputNames.size());
+    for (int i = 0; i < inputNames.size(); i++) {
+      map[inputNames[i].str()] = argNames[i];
+      attrsInOrder.push_back(argNames[i]);
+    }
+    return map;
+  };
+
+  llvm::StringMap<std::string> inNameToAttrMap, outNameToAttrMap;
+  llvm::SmallVector<std::string> attrsInOrder;
+
+  inNameToAttrMap = buildMap("arg_attrs", "inputs", attrsInOrder);
+  outNameToAttrMap = buildMap("res_attrs", "outputs", attrsInOrder);
+
+  for (int i = 0; i < inNameToAttrMap.size(); i++) {
+    inputTensorMap[attrsInOrder[i]] = values[i];
+  }
+
+  for (auto v : values) {
+    if (auto loc = v.getLoc()->dyn_cast_or_null<NameLoc>()) {
+      if (outNameToAttrMap.count(loc.getName())) {
+        outputTensorMap[outNameToAttrMap[loc.getName()]] = v;
+      }
+    }
+  }
+}
+
 std::vector<int> MemoryPlan::getAllocatedOffsets(const bool overlapOps,
                                                  int &peakMemoryUsed,
                                                  int &peakOpId) {
@@ -245,6 +311,22 @@ std::vector MemoryPlan::getAllocatedOffsets(const bool overlapOps,
     }
   }
 
+  // Handle same-allocation input and output tensors
+  llvm::DenseSet<Value> inputTensorSet;
+  llvm::DenseSet<Value> outputTensorSet;
+  llvm::StringMap<Value> inputTensorMap, outputTensorMap;
+
+  if (sameAllocationInputOutputTensorOption.size() > 0) {
+    buildInputOutputTensorMaps(inputTensorMap, outputTensorMap);
+    for (int i = 0; i < sameAllocationInputOutputTensorOption.size();
+         i = i + 2) {
+      inputTensorSet.insert(
+          inputTensorMap[sameAllocationInputOutputTensorOption[i]]);
+      outputTensorSet.insert(
+          outputTensorMap[sameAllocationInputOutputTensorOption[i + 1]]);
+    }
+  }
+
   // The comparator keeps the buffers ordered by id if their sizes are the
   // same
   auto DecreasingSizesComparator = [&](QueueItem &lhs, QueueItem &rhs) {
@@ -259,23 +341,51 @@ std::vector MemoryPlan::getAllocatedOffsets(const bool overlapOps,
       queue(DecreasingSizesComparator);
 
   // Insert values and their sizes into priority queue
+  // inOutMap keeps out values that are overlapped; in a chain of overlapped
+  // values, only the last value is allocated here and the rest are patched
+  // up and added to the allocated values list later.
+  // Same-allocation input and output tensors are also not inserted into the
+  // queue, since they are allocated separately below.
   for (auto v : values) {
-    if (!inOutMap.count(v) && !vInfo[v].isConstant) {
+    if (!inOutMap.count(v) && !vInfo[v].isConstant &&
+        !outputTensorSet.contains(v) && !inputTensorSet.contains(v)) {
       queue.push({v, vInfo[v].size});
     }
   }
 
   ValuesOrderedByOffset allocatedValues;
-  auto v = queue.top().first;
-  queue.pop();
-  allocatedValues.insert({v, 0});
+
+  // If there are same-allocation input and output tensors, allocate those
+  // first.
+  if (sameAllocationInputOutputTensorOption.size() > 0) {
+    // Allocate the first input and output tensor pair at offset zero.
+    allocatedValues.insert(
+        {inputTensorMap[sameAllocationInputOutputTensorOption[0]], 0});
+    allocatedValues.insert(
+        {outputTensorMap[sameAllocationInputOutputTensorOption[1]], 0});
+
+    for (int i = 2; i < sameAllocationInputOutputTensorOption.size();
+         i = i + 2) {
+      auto inputTensor =
+          inputTensorMap[sameAllocationInputOutputTensorOption[i]];
+      int newOffset = getOffset(inputTensor, vInfo[inputTensor].size, vInfo,
+                                allocatedValues);
+      allocatedValues.insert({inputTensor, newOffset});
+      allocatedValues.insert(
+          {outputTensorMap[sameAllocationInputOutputTensorOption[i + 1]],
+           newOffset});
+    }
+  } else {
+    // Otherwise allocate the largest tensor at offset zero.
+    auto v = queue.top().first;
+    queue.pop();
+    allocatedValues.insert({v, 0});
+  }
 
   while (!queue.empty()) {
     auto v = queue.top().first;
     auto size = queue.top().second;
     queue.pop();
-    // check with allocatedValues list
     int newOffset = getOffset(v, size, vInfo, allocatedValues);
     allocatedValues.insert({v, newOffset});
   }
@@ -313,6 +423,37 @@ std::vector MemoryPlan::getAllocatedOffsets(const bool overlapOps,
     }
     allocatedValuesOrderedByID.insert(i);
   }
+  // Check if buffers clash
+  // for (auto i : allocatedValuesOrderedByID) {
+  //   for (auto j : allocatedValuesOrderedByID) {
+  //     if (vInfo[i.first].id < vInfo[j.first].id) {
+  //       if ((vInfo[i.first].firstUsed > vInfo[j.first].firstUsed &&
+  //            vInfo[i.first].firstUsed < vInfo[j.first].lastUsed) ||
+  //           (vInfo[j.first].firstUsed > vInfo[i.first].firstUsed &&
+  //            vInfo[j.first].firstUsed < vInfo[i.first].lastUsed)) {
+  //         auto iBegin = i.second;
+  //         auto iEnd = i.second + vInfo[i.first].size;
+  //         auto jBegin = j.second;
+  //         auto jEnd = j.second + vInfo[j.first].size;
+  //         if ((iBegin > jBegin && iBegin < jEnd) ||
+  //             (jBegin > iBegin && jBegin < iEnd)) {
+  //           printf("\n\nProblem!");
+  //           std::cout << "\nValue one " << vInfo[i.first].id
+  //                     << ", size = " << vInfo[i.first].size
+  //                     << ", offset = " << i.second
+  //                     << ", first = " << vInfo[i.first].firstUsed
+  //                     << ", last = " << vInfo[i.first].lastUsed;
+  //           std::cout << "\nValue two " << vInfo[j.first].id
+  //                     << ", size = " << vInfo[j.first].size
+  //                     << ", offset = " << j.second
+  //                     << ", first = " << vInfo[j.first].firstUsed
+  //                     << ", last = " << vInfo[j.first].lastUsed;
+  //         }
+  //       }
+  //     }
+  //   }
+  // }
+
   size_t peakUsed = 0;
   size_t peakUsedValueID = 0;
  size_t maxId = 0;
diff --git a/xformer/Analysis/MemoryPlan.h b/xformer/Analysis/MemoryPlan.h
index 49c463145..fde7f3248 100644
--- a/xformer/Analysis/MemoryPlan.h
+++ b/xformer/Analysis/MemoryPlan.h
@@ -7,12 +7,21 @@
 #include "mlir/Analysis/Liveness.h"
 #include "mlir/IR/Value.h"
 #include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/StringMap.h"
 #include
 
 namespace mlir {
 namespace xcore {
 
+struct ValueInfo {
+  size_t id;
+  size_t size;
+  bool isConstant;
+  int firstUsed;
+  int lastUsed;
+};
+
 // Represents an analysis for memory planning of a given FuncOp for a model.
 // - Uses liveness analysis and a greedy algorithm to arrange buffers in memory.
 // - Tries to overlap input and output buffers based on the op characteristics.
@@ -51,6 +60,11 @@ class MemoryPlan {
   std::vector<Operation *> getOperationsSequence() { return operations; }
 
+  DenseMap<Value, ValueInfo> getValuesInfoMap() { return valueInfo; }
+
+  void buildInputOutputTensorMaps(llvm::StringMap<Value> &inputTensorMap,
+                                  llvm::StringMap<Value> &outputTensorMap);
+
   // OpSplitPlan getOpSplitPlan();
 
   void printMemoryPlan();
@@ -70,14 +84,6 @@ class MemoryPlan {
   using ValuesOrderedByOffset = std::multiset;
 
-  struct ValueInfo {
-    size_t id;
-    size_t size;
-    bool isConstant;
-    int firstUsed;
-    int lastUsed;
-  };
-
   int getOffset(Value v, int size, DenseMap<Value, ValueInfo> &valueInfo,
                 ValuesOrderedByOffset &allocatedOffsets);
diff --git a/xformer/Transforms/Options.h b/xformer/Transforms/Options.h
index b200309c5..046e7e32c 100644
--- a/xformer/Transforms/Options.h
+++ b/xformer/Transforms/Options.h
@@ -32,6 +32,8 @@ extern llvm::cl::opt convDebugOption;
 extern llvm::cl::opt overlapConvOption;
 extern llvm::cl::opt offlineOffsetsOption;
 extern llvm::cl::opt convChannelwiseSplitSizeOption;
+extern llvm::cl::list<std::string> sameAllocationInputOutputTensorOption;
+
 } // namespace xcore
 } // namespace mlir
diff --git a/xformer/Transforms/Passes.cpp b/xformer/Transforms/Passes.cpp
index 6acce3f26..11aff6de7 100644
--- a/xformer/Transforms/Passes.cpp
+++ b/xformer/Transforms/Passes.cpp
@@ -15,6 +15,7 @@ void buildXCorePreOpSplitPassPipeline(OpPassManager &pm) {
   // Run pass from LCE to convert Larq ops which are in TFL custom op format to
   // Larq dialect
   pm.addPass(mlir::TFL::CreateTranslateToLCEPass());
+  pm.addPass(createVerifySameAllocationTensorsPass());
   // Convert dynamic shapes in batch dimension to static
   pm.addPass(createRemoveDynamicShapePass());
 }
diff --git a/xformer/Transforms/Passes.h b/xformer/Transforms/Passes.h
index dfdc15a7e..95eb57dda 100644
--- a/xformer/Transforms/Passes.h
+++ b/xformer/Transforms/Passes.h
@@ -30,6 +30,8 @@ std::unique_ptr> createReplaceFCWithConv2DPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createOptimizeConv2DPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createOpSplitPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createApplyTFLPatternsPass();
+std::unique_ptr<OperationPass<func::FuncOp>>
+createVerifySameAllocationTensorsPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createRemoveDynamicShapePass();
 std::unique_ptr<OperationPass<func::FuncOp>> createReplaceAddSubPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createReplaceMulPass();
diff --git a/xformer/Transforms/VerifySameAllocationTensors.cpp b/xformer/Transforms/VerifySameAllocationTensors.cpp
new file mode 100644
index 000000000..7f9f0a424
--- /dev/null
+++ b/xformer/Transforms/VerifySameAllocationTensors.cpp
@@ -0,0 +1,166 @@
+// Copyright 2021 XMOS LIMITED. This Software is subject to the terms of the
+// XMOS Public License: Version 1
+
+#include "Analysis/MemoryPlan.h"
+#include "IR/XCoreOps.h"
+#include "Transforms/Options.h"
+#include "Utils/Util.h"
+
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
+
+namespace mlir::xcore {
+
+namespace {
+struct VerifySameAllocationTensors
+    : public PassWrapper<VerifySameAllocationTensors,
+                         OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(VerifySameAllocationTensors)
+
+  void getDependentDialects(DialectRegistry &registry) const final {
+    registry.insert();
+  }
+  StringRef getArgument() const final { return "xcore-preset-allocations"; }
+  StringRef getDescription() const final {
+    return "Verify tensors specified for same allocation";
+  }
+  void runOnOperation() override;
+};
+
+void VerifySameAllocationTensors::runOnOperation() {
+  auto func = getOperation();
+  auto *ctx = &getContext();
+
+  if (sameAllocationInputOutputTensorOption.size() > 0) {
+
+    auto &m = getAnalysis<MemoryPlan>();
+    llvm::StringMap<Value> inputTensorMap, outputTensorMap;
+    m.buildInputOutputTensorMaps(inputTensorMap, outputTensorMap);
+
+    bool failed = false;
+    // Check names of input and output tensors
+    for (int i = 0; i < sameAllocationInputOutputTensorOption.size();
+         i = i + 2) {
+      if (!inputTensorMap.count(sameAllocationInputOutputTensorOption[i])) {
+        func.emitError()
+            << sameAllocationInputOutputTensorOption[i]
+            << " is not present in the input tensors. Please check the name!";
+        failed = true;
+      }
+      if (!outputTensorMap.count(
+              sameAllocationInputOutputTensorOption[i + 1])) {
+        func.emitError()
+            << sameAllocationInputOutputTensorOption[i + 1]
+            << " is not present in the output tensors. Please check the name!";
+        failed = true;
+      }
+    }
+
+    if (failed) {
+      signalPassFailure();
+      return;
+    }
+
+    // Check sizes
+    auto vInfo = m.getValuesInfoMap();
+    for (int i = 0; i < sameAllocationInputOutputTensorOption.size();
+         i = i + 2) {
+      if (vInfo[inputTensorMap[sameAllocationInputOutputTensorOption[i]]]
+              .size !=
+          vInfo[outputTensorMap[sameAllocationInputOutputTensorOption[i + 1]]]
+              .size) {
+        func.emitError() << "Size of input tensor "
+                         << sameAllocationInputOutputTensorOption[i]
+                         << " is not equal to that of output tensor "
+                         << sameAllocationInputOutputTensorOption[i + 1]
+                         << ". Please check!";
+        failed = true;
+      }
+    }
+
+    // Check quantization
+    for (int i = 0; i < sameAllocationInputOutputTensorOption.size();
+         i = i + 2) {
+      auto inQType = dyn_cast_or_null<mlir::quant::UniformQuantizedType>(
+          inputTensorMap[sameAllocationInputOutputTensorOption[i]]
+              .getType()
+              .cast<ShapedType>()
+              .getElementType());
+      auto outQType = dyn_cast_or_null<mlir::quant::UniformQuantizedType>(
+          outputTensorMap[sameAllocationInputOutputTensorOption[i + 1]]
+              .getType()
+              .cast<ShapedType>()
+              .getElementType());
+      if (inQType && !outQType) {
+        func.emitError() << "Input tensor "
+                         << sameAllocationInputOutputTensorOption[i]
+                         << " is quantized, but "
+                         << sameAllocationInputOutputTensorOption[i + 1]
+                         << " is not. Please check!";
+        failed = true;
+      } else if (!inQType && outQType) {
+        func.emitError() << "Input tensor "
+                         << sameAllocationInputOutputTensorOption[i]
+                         << " is not quantized, but "
+                         << sameAllocationInputOutputTensorOption[i + 1]
+                         << " is quantized. Please check!";
Please check!"; + failed = true; + } else if (inQType && outQType) { + // Both are quantized, but check element sizes, maybe i8 and i16 + + auto inScale = inQType.getScale(); + auto inZeroPoint = inQType.getZeroPoint(); + + auto outScale = outQType.getScale(); + auto outZeroPoint = outQType.getZeroPoint(); + if (inScale != outScale || inZeroPoint != outZeroPoint) { + // change input block arg to output quantization + + // insert quantize op to convert back to original input quantization + // auto module = func->getParentOfType(); + // OpBuilder builder(module); + // auto outVal = + // outputTensorMap[sameAllocationInputOutputTensorOption[i + // + 1]]; auto newQType = inQType.castFromExpressedType( + // quant::QuantizedType::castToExpressedType(outVal.getType())); + // auto newQuantizeOp = builder.create( + // inVal.getLoc(), newQType, outVal, TypeAttr::get(inQType)); + + auto inVal = inputTensorMap[sameAllocationInputOutputTensorOption[i]]; + auto typeNumBits = + utils::getTypeSize( + inVal.getType().cast().getElementType()) * + 8; + double maxError = 1.0 / (2 << (typeNumBits - 1)); + if (abs(inScale - outScale) > maxError) { + func.emitError() + << "Input tensor " << sameAllocationInputOutputTensorOption[i] + << " has scale of " << inScale << " and zeropoint of " + << inZeroPoint << ", but output tensor " + << sameAllocationInputOutputTensorOption[i + 1] + << " has scale of " << outScale << " and zeropoint of " + << outZeroPoint << ". Please check!"; + failed = true; + } + } + } else if (!inQType && !outQType) { + // Both are not quantized, but check element sizes, maybe i8 and i16 + } + } + + if (failed) { + signalPassFailure(); + return; + } + } +} +} // namespace + +// Creates an instance of the VerifySameAllocationTensors pass. +std::unique_ptr> +createVerifySameAllocationTensorsPass() { + return std::make_unique(); +} + +static PassRegistration pass; + +} // namespace mlir::xcore diff --git a/xformer/XCoreOptMain.cpp b/xformer/XCoreOptMain.cpp index 63310d300..26cadeaba 100644 --- a/xformer/XCoreOptMain.cpp +++ b/xformer/XCoreOptMain.cpp @@ -36,6 +36,13 @@ namespace mlir::xcore { // and -help) will be hidden. static cl::OptionCategory XformerCategory("Xformer options"); +llvm::cl::list sameAllocationInputOutputTensorOption( + "xcore-same-allocation-input-output-tensor", + cl::desc("Allocate this input and output tensor in the same memory " + "location. This helps avoiding a memcopy from output to input in " + "case of recurrent networks. The first tensor must be the input."), + cl::CommaSeparated, cl::cat(XformerCategory)); + cl::opt enableMemoryAnalysisOption( "xcore-run-memory-analysis", cl::desc("Run memory analysis to aid in operation splitting."), @@ -507,6 +514,24 @@ int main(int argc, char **argv) { return failedMessage("Please specify a thread count between one and five!"); } + llvm::DenseMap positionCountMap; + for (int i = 0; i < mlir::xcore::sameAllocationInputOutputTensorOption.size(); + i++) { + int pos = mlir::xcore::sameAllocationInputOutputTensorOption.getPosition(i); + if (positionCountMap.count(pos)) { + positionCountMap[pos]++; + } else { + positionCountMap[pos] = 1; + } + } + for (auto i : positionCountMap) { + if (i.second != 2) { + return failedMessage( + "Please specify two tensors, an input tensor and output tensor for " + "each of xcore-same-allocation-input-output-tensor options!"); + } + } + if (failed(isCompatibleVersion( versionLibTfliteMicro, lib_tflite_micro::major_version, lib_tflite_micro::minor_version, lib_tflite_micro::patch_version))) {