diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index 8ccd7c1011..f5d88f102f 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -7,8 +7,7 @@ file(GLOB_RECURSE SRC CONFIGURE_DEPENDS LIST_DIRECTORIES False src/*.cc - src/cuda/cuda_helper.cu - src/cuda/ops/*.cu + src/cuda/*.cu ) add_library( @@ -30,6 +29,7 @@ target_link_libraries( cudnn nccl utils + pcg ) define_ff_vars(${project_target}) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 39da65c3be..487bc1f8f0 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -5,11 +5,95 @@ #include "device.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" +#include "pcg/device_type.dtg.h" #include "utils/exception.h" #include "utils/required.h" namespace FlexFlow { +class GenericTensorAccessorR { +public: + template + typename data_type_enum_to_class
::type const *get() const { + if (this->data_type == DT) { + return static_cast const *>(this->ptr); + } else { + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); + } + } + + int32_t const *get_int32_ptr() const; + int64_t const *get_int64_ptr() const; + float const *get_float_ptr() const; + double const *get_double_ptr() const; + half const *get_half_ptr() const; + + GenericTensorAccessorR() = delete; + + GenericTensorAccessorR(DataType data_type, + ArrayShape const &shape, + void const *ptr, + DeviceType device_type); + + bool operator==(GenericTensorAccessorR const &) const; + bool operator!=(GenericTensorAccessorR const &) const; + + template + real_type_t
const &at(std::vector const &indices) const { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); + } + if (indices.size() != this->shape.num_dims()) { + throw mk_runtime_error(fmt::format("Number of indices ({}) does not " + "match the number of dimensions ({}).", + indices.size(), + this->shape.num_dims())); + } + + using T = real_type_t
; + T const *data_ptr = static_cast(this->ptr); + + int offset = 0; + int multiplier = 1; + for (int i = 0; i < this->shape.num_dims(); i++) { + if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + indices.at(i), + this->shape.at(legion_dim_t{i}))); + } + + offset += indices.at(i) * multiplier; + multiplier *= this->shape.at(legion_dim_t{i}); + } + + return data_ptr[offset]; + } + +public: + DataType data_type; + ArrayShape shape; + void const *ptr; + DeviceType device_type; + +private: + std::tuple + tie() const; +}; + +std::string format_as(GenericTensorAccessorR const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); + class GenericTensorAccessorW { public: template @@ -28,64 +112,110 @@ class GenericTensorAccessorW { double *get_double_ptr() const; half *get_half_ptr() const; -public: - DataType data_type; - ArrayShape shape; - req ptr; -}; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW, - data_type, - shape, - ptr); + GenericTensorAccessorW() = delete; -std::string format_as(GenericTensorAccessorW const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); + GenericTensorAccessorW(DataType data_type, + ArrayShape const &shape, + void *ptr, + DeviceType device_type); + + bool operator==(GenericTensorAccessorW const &) const; + bool operator!=(GenericTensorAccessorW const &) const; + + operator GenericTensorAccessorR() const; -class GenericTensorAccessorR { -public: template - typename data_type_enum_to_class
::type const *get() const { - if (this->data_type == DT) { - return static_cast const *>(this->ptr); - } else { + real_type_t
&at(std::vector const &indices) { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { throw mk_runtime_error(fmt::format( "Invalid access data type ({} != {})", this->data_type, DT)); } + if (indices.size() != this->shape.num_dims()) { + throw mk_runtime_error(fmt::format("Number of indices ({}) does not " + "match the number of dimensions ({}).", + indices.size(), + this->shape.num_dims())); + } + + using T = real_type_t
; + + T *data_ptr = static_cast(this->ptr); + int offset = 0; + int multiplier = 1; + for (int i = 0; i < this->shape.num_dims(); i++) { + if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + indices.at(i), + this->shape.at(legion_dim_t{i}))); + } + + offset += indices.at(i) * multiplier; + multiplier *= this->shape.at(legion_dim_t{i}); + } + + return data_ptr[offset]; } - int32_t const *get_int32_ptr() const; - int64_t const *get_int64_ptr() const; - float const *get_float_ptr() const; - double const *get_double_ptr() const; - half const *get_half_ptr() const; + template + real_type_t
&at(std::vector const &indices) const { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); + } + if (indices.size() != this->shape.num_dims()) { + throw mk_runtime_error(fmt::format("Number of indices ({}) does not " + "match the number of dimensions ({}).", + indices.size(), + this->shape.num_dims())); + } + + using T = real_type_t
; + + T const *data_ptr = static_cast(this->ptr); + int offset = 0; + int multiplier = 1; + for (int i = 0; i < this->shape.num_dims(); i++) { + if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + indices.at(i), + this->shape.at(legion_dim_t{i}))); + } + + offset += indices.at(i) * multiplier; + multiplier *= this->shape.at(legion_dim_t{i}); + } + + return data_ptr[offset]; + } public: DataType data_type; ArrayShape shape; - req ptr; -}; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR, - data_type, - shape, - ptr); + void *ptr; + DeviceType device_type; -std::string format_as(GenericTensorAccessorR const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); +private: + std::tuple + tie() const; +}; -int32_t *get_int32_ptr(GenericTensorAccessorW const &); -int64_t *get_int64_ptr(GenericTensorAccessorW const &); -float *get_float_ptr(GenericTensorAccessorW const &); -double *get_double_ptr(GenericTensorAccessorW const &); -half *get_half_ptr(GenericTensorAccessorW const &); -std::vector - get_int32_ptrs(std::vector const &); -std::vector - get_int64_ptrs(std::vector const &); -std::vector - get_float_ptrs(std::vector const &); -std::vector - get_double_ptrs(std::vector const &); -std::vector get_half_ptrs(std::vector const &); +std::string format_as(GenericTensorAccessorW const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); static_assert(is_fmtable const &>::value, ""); @@ -137,6 +267,21 @@ std::vector std::vector get_half_ptrs(std::vector const &); +int32_t *get_int32_ptr(GenericTensorAccessorW const &); +int64_t *get_int64_ptr(GenericTensorAccessorW const &); +float *get_float_ptr(GenericTensorAccessorW const &); +double *get_double_ptr(GenericTensorAccessorW const &); +half *get_half_ptr(GenericTensorAccessorW const &); +std::vector + get_int32_ptrs(std::vector const &); +std::vector + get_int64_ptrs(std::vector const &); +std::vector + get_float_ptrs(std::vector const &); +std::vector + get_double_ptrs(std::vector const &); +std::vector get_half_ptrs(std::vector const &); + template std::vector const *> get(std::vector const &accs) { @@ -150,12 +295,8 @@ std::vector const *> GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &write_accessor); -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2); - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype); +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2); bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, @@ -163,8 +304,9 @@ bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, std::pair get_shape_and_datatype(GenericTensorAccessorR const &accessor); -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor); + +void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 6500899394..4bf97118ce 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -1,7 +1,7 @@ #ifndef 
_FLEXFLOW_KERNELS_ALLOCATION_H #define _FLEXFLOW_KERNELS_ALLOCATION_H -#include "accessor.h" +#include "kernels/accessor.h" #include #include @@ -11,6 +11,8 @@ struct IAllocator { virtual void *allocate(size_t) = 0; virtual void deallocate(void *) = 0; + virtual DeviceType get_allocation_device_type() const = 0; + virtual ~IAllocator() = default; }; @@ -18,9 +20,12 @@ struct Allocator { Allocator() = delete; GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape); + void *allocate(size_t mem_size); void deallocate(void *ptr); + DeviceType get_allocation_device_type() const; + template static typename std::enable_if::value, Allocator>::type diff --git a/lib/kernels/include/kernels/attention_kernels.h b/lib/kernels/include/kernels/attention_kernels.h index eb5a1b8198..1e483102dd 100644 --- a/lib/kernels/include/kernels/attention_kernels.h +++ b/lib/kernels/include/kernels/attention_kernels.h @@ -64,8 +64,7 @@ FF_VISITABLE_STRUCT_NO_EQ(MHAPerDeviceState, std::string format_as(MHAPerDeviceState const &x); std::ostream &operator<<(std::ostream &s, MHAPerDeviceState const &x); -namespace Kernels { -namespace MultiHeadAttention { +namespace Kernels::MultiHeadAttention { MHAPerDeviceState init_kernel(PerDeviceFFHandle const &, Allocator &, @@ -105,8 +104,7 @@ void backward_kernel(ffStream_t stream, void cleanup_kernel(Allocator &allocator, MHAPerDeviceState const &device_state); -} // namespace MultiHeadAttention -} // namespace Kernels +} // namespace Kernels::MultiHeadAttention } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/batch_matmul_kernels.h b/lib/kernels/include/kernels/batch_matmul_kernels.h index bfd72647b0..bde91bea15 100644 --- a/lib/kernels/include/kernels/batch_matmul_kernels.h +++ b/lib/kernels/include/kernels/batch_matmul_kernels.h @@ -5,9 +5,7 @@ #include "kernels/allocation.h" #include "kernels/ff_handle.h" -namespace FlexFlow { -namespace Kernels { -namespace BatchMatmul { +namespace FlexFlow::Kernels::BatchMatmul { void forward_kernel(ffStream_t stream, PerDeviceFFHandle const &handle, @@ -35,8 +33,6 @@ void backward_kernel(ffStream_t stream, int k, int batch); -} // namespace BatchMatmul -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::BatchMatmul #endif diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index 7d533d672c..3fea92c86b 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -43,8 +43,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(BatchNormPerDeviceState, output_w, relu); -namespace Kernels { -namespace BatchNorm { +namespace Kernels::BatchNorm { BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, Allocator allocator, @@ -64,9 +63,9 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, BatchNormPerDeviceState const &m, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, @@ -81,8 +80,7 @@ void cleanup_kernel(Allocator allocator, bool relu, float *runningMean); -} // namespace BatchNorm -} // namespace Kernels +} // namespace Kernels::BatchNorm } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index 96f9aadd52..da13e0036d 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ 
b/lib/kernels/include/kernels/cast_kernels.h @@ -3,27 +3,17 @@ #include "device.h" #include "kernels/accessor.h" -#include "kernels/ff_handle.h" -#include "op-attrs/activation.dtg.h" -namespace FlexFlow { -namespace Kernels { -namespace Cast { +namespace FlexFlow::Kernels::Cast { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Cast -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Cast #endif diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h new file mode 100644 index 0000000000..a5df80d4da --- /dev/null +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H + +#include "device.h" +#include "kernels/accessor.h" + +namespace FlexFlow::Kernels::Cast { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); + +} // namespace FlexFlow::Kernels::Cast + +#endif diff --git a/lib/kernels/include/kernels/combine_kernels.h b/lib/kernels/include/kernels/combine_kernels.h index eb263e0734..50de18e823 100644 --- a/lib/kernels/include/kernels/combine_kernels.h +++ b/lib/kernels/include/kernels/combine_kernels.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Combine { +namespace FlexFlow::Kernels::Combine { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -16,8 +14,6 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad); -} // namespace Combine -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Combine #endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h new file mode 100644 index 0000000000..430c7cf906 --- /dev/null +++ b/lib/kernels/include/kernels/combine_kernels_cpu.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H + +#include "device.h" +#include "kernels/accessor.h" + +namespace FlexFlow::Kernels::Combine { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); + +} // namespace FlexFlow::Kernels::Combine + +#endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/concat_kernels.h b/lib/kernels/include/kernels/concat_kernels.h index a44affc1f2..33355296dd 100644 --- a/lib/kernels/include/kernels/concat_kernels.h +++ b/lib/kernels/include/kernels/concat_kernels.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Concat { +namespace FlexFlow::Kernels::Concat { void 
forward_kernel(ffStream_t stream, GenericTensorAccessorW const &output, @@ -18,8 +16,6 @@ void backward_kernel(ffStream_t stream, std::vector const &input_grads, ff_dim_t axis); -} // namespace Concat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Concat #endif diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index cfc64f963d..f49c8f50f4 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -34,8 +34,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Conv2DPerDeviceState, bwdFilterAlgo, bwdDataAlgo); -namespace Kernels { -namespace Conv2D { +namespace Kernels::Conv2D { Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, std::optional activation, @@ -61,17 +60,16 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, std::optional activation); -} // namespace Conv2D -} // namespace Kernels +} // namespace Kernels::Conv2D } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/copy_tensor_accessor.h b/lib/kernels/include/kernels/copy_tensor_accessor.h new file mode 100644 index 0000000000..da8af71e4f --- /dev/null +++ b/lib/kernels/include/kernels/copy_tensor_accessor.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H +#define _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" + +namespace FlexFlow { + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/datatype_dispatch.h b/lib/kernels/include/kernels/datatype_dispatch.h index e83fc3325d..50ca66a820 100644 --- a/lib/kernels/include/kernels/datatype_dispatch.h +++ b/lib/kernels/include/kernels/datatype_dispatch.h @@ -1,7 +1,8 @@ #ifndef _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H #define _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H -#include "accessor.h" +#include "op-attrs/datatype.h" +#include "utils/exception.h" namespace FlexFlow { @@ -33,7 +34,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(Args... args) const { + Out operator()(Args &&...args) const { return F
{}(std::forward(args)...); } }; @@ -41,7 +42,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(DataType data_type, Args... args) { + Out operator()(DataType data_type, Args &&...args) { return dispatch(data_type, std::forward(args)...); } }; @@ -54,13 +55,13 @@ struct DataTypeDispatch2 { template struct OutputType { template - void operator()(Args... args) const { + void operator()(Args &&...args) const { F{}(std::forward(args)...); } }; template - void operator()(DataType output_type, Args... args) const { + void operator()(DataType output_type, Args &&...args) const { dispatch(output_type, std::forward(args)...); } }; @@ -68,7 +69,7 @@ struct DataTypeDispatch2 { template void operator()(DataType input_data_type, DataType output_data_type, - Args... args) { + Args &&...args) { dispatch( input_data_type, output_data_type, std::forward(args)...); } diff --git a/lib/kernels/include/kernels/dropout_kernels.h b/lib/kernels/include/kernels/dropout_kernels.h index c0e503be5b..4790540098 100644 --- a/lib/kernels/include/kernels/dropout_kernels.h +++ b/lib/kernels/include/kernels/dropout_kernels.h @@ -31,8 +31,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(DropoutPerDeviceState, reserveSpaceSize, dropoutStateSize); -namespace Kernels { -namespace Dropout { +namespace Kernels::Dropout { DropoutPerDeviceState init_kernel(PerDeviceFFHandle handle, float rate, @@ -56,8 +55,7 @@ void cleanup_kernel(Allocator allocator, ffDropoutDescriptor_t dropoutDesc, void *dropoutStates); -} // namespace Dropout -} // namespace Kernels +} // namespace Kernels::Dropout } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H diff --git a/lib/kernels/include/kernels/element_binary_kernels.h b/lib/kernels/include/kernels/element_binary_kernels.h index 41447e98e6..1017230fb0 100644 --- a/lib/kernels/include/kernels/element_binary_kernels.h +++ b/lib/kernels/include/kernels/element_binary_kernels.h @@ -26,8 +26,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(ElementBinaryPerDeviceState, opDesc, reduceAddDesc); -namespace Kernels { -namespace ElementBinary { +namespace Kernels::ElementBinary { ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle, OperatorType op_type, @@ -58,8 +57,7 @@ void backward_kernel(ffStream_t stream, bool broadcast_inputRHS, PerDeviceFFHandle handle); -} // namespace ElementBinary -} // namespace Kernels +} // namespace Kernels::ElementBinary } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h index 8c6864b2d9..c338f465ac 100644 --- a/lib/kernels/include/kernels/element_unary_kernels.h +++ b/lib/kernels/include/kernels/element_unary_kernels.h @@ -19,8 +19,7 @@ FF_VISITABLE_STRUCT_NO_EQ(ElementUnaryPerDeviceState, outputTensor, actiDesc); -namespace Kernels { -namespace ElementUnary { +namespace Kernels::ElementUnary { ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, ArrayShape const &output_shape, @@ -37,13 +36,12 @@ void backward_kernel(ffStream_t stream, ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad); -} // namespace ElementUnary -} // namespace Kernels +} 
// namespace Kernels::ElementUnary } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/embedding_kernels.h b/lib/kernels/include/kernels/embedding_kernels.h index 06582ca1d5..f5b2561b56 100644 --- a/lib/kernels/include/kernels/embedding_kernels.h +++ b/lib/kernels/include/kernels/embedding_kernels.h @@ -5,9 +5,7 @@ #include "kernels/accessor.h" #include "op-attrs/ops/embedding.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, @@ -19,11 +17,11 @@ void forward_kernel(ffStream_t stream, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, @@ -35,8 +33,6 @@ void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p); template __global__ void rand_generate_int(TD *ptr, size_t size, TD p); -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding #endif // _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H diff --git a/lib/kernels/include/kernels/flat_kernels.h b/lib/kernels/include/kernels/flat_kernels.h index 3e600c48de..d60a1a5157 100644 --- a/lib/kernels/include/kernels/flat_kernels.h +++ b/lib/kernels/include/kernels/flat_kernels.h @@ -4,20 +4,17 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Flat { +namespace FlexFlow::Kernels::Flat { void forward_kernel(ffStream_t stream, GenericTensorAccessorR input, float *output_ptr); -void backward_kernel(ffStream_t stream, + +void backward_kernel(cudaStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr); + float const *output_grad_ptr, + float *input_grad_ptr); -} // namespace Flat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Flat #endif // _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H diff --git a/lib/kernels/include/kernels/gather_kernels.h b/lib/kernels/include/kernels/gather_kernels.h index 13bf4b898a..af2da3b11f 100644 --- a/lib/kernels/include/kernels/gather_kernels.h +++ b/lib/kernels/include/kernels/gather_kernels.h @@ -15,8 +15,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState, handle, legion_dim); -namespace Kernels { -namespace Gather { +namespace Kernels::Gather { void forward_kernel(ffStream_t stream, GatherPerDeviceState const &m, @@ -30,8 +29,7 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &index, GenericTensorAccessorW const &input_grad); -} // namespace Gather -} // namespace Kernels +} // namespace Kernels::Gather } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/layer_norm_kernels.h b/lib/kernels/include/kernels/layer_norm_kernels.h index be13d32879..a6ae87442a 100644 --- a/lib/kernels/include/kernels/layer_norm_kernels.h +++ b/lib/kernels/include/kernels/layer_norm_kernels.h @@ -30,8 +30,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LayerNormPerDeviceState, bias, data_type); -namespace Kernels { -namespace LayerNorm { +namespace Kernels::LayerNorm { // todo: this may have some problem. 
LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle, @@ -57,8 +56,7 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorW const &gamma_grad, GenericTensorAccessorW const &beta_grad); -} // namespace LayerNorm -} // namespace Kernels +} // namespace Kernels::LayerNorm } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h index 3128e39fd0..cd581b0a25 100644 --- a/lib/kernels/include/kernels/linear_kernels.h +++ b/lib/kernels/include/kernels/linear_kernels.h @@ -33,8 +33,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LinearPerDeviceState, weight_type, output_type); -namespace Kernels { -namespace Linear { +namespace Kernels::Linear { LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, float *one_ptr, @@ -51,29 +50,28 @@ bool use_activation(Activation activation); void forward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *filter_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_ptr, int in_dim, int out_dim, int batch_size); -} // namespace Linear -} // namespace Kernels +} // namespace Kernels::Linear } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h new file mode 100644 index 0000000000..cf6cfe35d1 --- /dev/null +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -0,0 +1,24 @@ +#include "kernels/allocation.h" +#include + +namespace FlexFlow { + +struct LocalCPUAllocator : public IAllocator { + LocalCPUAllocator() = default; + LocalCPUAllocator(LocalCPUAllocator const &) = delete; + LocalCPUAllocator(LocalCPUAllocator &&) = delete; + ~LocalCPUAllocator() = default; + + void *allocate(size_t) override; + void deallocate(void *) override; + + DeviceType get_allocation_device_type() const override; + +private: + std::unordered_map> ptrs; +}; +CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCPUAllocator); + +Allocator create_local_cpu_memory_allocator(); + +} // namespace FlexFlow diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h index 18a4b6e78a..b8e0540974 100644 --- a/lib/kernels/include/kernels/local_cuda_allocator.h +++ b/lib/kernels/include/kernels/local_cuda_allocator.h @@ -12,6 +12,8 @@ struct LocalCudaAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_set ptrs; }; diff --git a/lib/kernels/include/kernels/loss_function_kernels.h b/lib/kernels/include/kernels/loss_function_kernels.h index bab404f884..9e0dbd4ba1 100644 --- a/lib/kernels/include/kernels/loss_function_kernels.h +++ b/lib/kernels/include/kernels/loss_function_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H #define 
_FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H -#include "kernels/device.h" +#include "device.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/managed_ff_stream.h b/lib/kernels/include/kernels/managed_ff_stream.h index 2f690b2eb3..26d5fb4911 100644 --- a/lib/kernels/include/kernels/managed_ff_stream.h +++ b/lib/kernels/include/kernels/managed_ff_stream.h @@ -19,6 +19,8 @@ struct ManagedFFStream { ffStream_t const &raw_stream() const; + void cleanup(); + private: ffStream_t *stream; }; diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index 0a83a5eecb..035ea574de 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -7,7 +7,10 @@ namespace FlexFlow { struct ManagedPerDeviceFFHandle { public: - ManagedPerDeviceFFHandle(); + ManagedPerDeviceFFHandle() = delete; + + ManagedPerDeviceFFHandle(size_t workSpaceSize, + bool allowTensorOpMathConversion); ManagedPerDeviceFFHandle(ManagedPerDeviceFFHandle const &) = delete; ManagedPerDeviceFFHandle & @@ -21,6 +24,8 @@ struct ManagedPerDeviceFFHandle { PerDeviceFFHandle const &raw_handle() const; + void cleanup(); + private: PerDeviceFFHandle *handle; }; diff --git a/lib/kernels/include/kernels/metrics_kernels.h b/lib/kernels/include/kernels/metrics_kernels.h index e4660808b9..d961ee7503 100644 --- a/lib/kernels/include/kernels/metrics_kernels.h +++ b/lib/kernels/include/kernels/metrics_kernels.h @@ -1,25 +1,24 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H -#include "perf_metrics.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric.h" namespace FlexFlow { -void update_metrics_sparse_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - int const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); -void update_metrics_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - float const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const *me, + int num_effective_samples, + int num_classes, + PerfMetrics &perf_zc); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const *me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc); } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/nccl.h b/lib/kernels/include/kernels/nccl.h index b8a6784676..042911d172 100644 --- a/lib/kernels/include/kernels/nccl.h +++ b/lib/kernels/include/kernels/nccl.h @@ -23,15 +23,11 @@ struct ncclUniqueId {}; struct ncclComm_t {}; #endif -namespace FlexFlow { -namespace Kernels { -namespace NCCL { +namespace FlexFlow::Kernels::NCCL { ncclUniqueId generate_unique_id(); ncclComm_t create_comm(ncclUniqueId const &, int num_ranks, int my_rank); -} // namespace NCCL -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::NCCL #endif diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index 9ca6bf8e2b..3b5d292a5f 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -2,53 +2,91 @@ #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H #include "device.h" +#include 
"kernels/ff_handle.h" +#include "kernels/nccl.h" +#include "kernels/per_device_op_state.dtg.h" namespace FlexFlow { -void sgd_ps_update_task_gpu(ffStream_t, - float lr, - float momentum, - bool nesterov, +__global__ void sgd_update(size_t count, + float lr, + float weight_decay, + float momentum, + bool nesterov, + float const *WGrad, + float *V, + float *W); + +class SGDOptimizer { +public: + static __host__ void ps_update_task_gpu(SGDOptimizer const *op, + float const *w_grad_ptr, + size_t size, + int num_replicas, + float *w_ptr, + float *v_ptr); + +#ifdef FF_USE_NCCL + static __host__ void nccl_update_task_gpu(SGDOptimizer const *op, + PerDeviceOpState const *meta, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr); +#endif + +public: + float lr; + float weight_decay; + float momentum; + bool nesterov; +}; + +__global__ void + add_kernel(int count, float scale, float const *src, float *dst); + +__global__ void scale_kernel(int count, float a, float b, float *ptr); + +__global__ void adam_update(int count, + float alpha_t, + float beta1, + float beta2, float weight_decay, - float const *weight_grad_ptr, - size_t size, - int num_replicas, - float *weight_ptr, - float *sgd_v_ptr); - -void sgd_nccl_update_task_gpu(ffStream_t, - float lr, - float momentum, - bool nesterov, - float weight_decay PerDeviceFFHandle const &, - float const *weight_grad_ptr, - size_t size, - float *weight_ptr, - float *sgd_v_ptr); - -void adam_ps_update_task_gpu(ffStream_t, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - float const *weight_grad_ptr, - float *adam_m_ptr, - float *adam_v_ptr, - float *weight_ptr); - -void adam_nccl_update_task_gpu(ffStream_t, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - PerDeviceFFHandle const &, - float const *weight_grad_ptr, - float *adam_m_ptr, - float *adam_v_ptr, - float *weight_ptr); + float epsilon, + float const *WGrad, + float *M, + float *V, + float *W); -} // namespace FlexFlow +class AdamOptimizer { +public: + static __host__ void ps_update_task_gpu(AdamOptimizer const *op, + float const *w_grad_ptr, + size_t size, + int num_replicas, + float *w_ptr, + float *v_ptr, + float *m_ptr); +#ifdef FF_USE_NCCL + static __host__ void nccl_update_task_gpu(AdamOptimizer const *op, + PerDeviceOpState const *meta, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr, + float *m_ptr); #endif + +public: + float alpha; + float alpha_t; + float beta1; + float beta2; + float weight_decay; + float epsilon; +}; + +} // namespace FlexFlow + +#endif // _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H diff --git a/lib/kernels/include/kernels/partition_kernels.h b/lib/kernels/include/kernels/partition_kernels.h index 64ef1a1352..9a303952d0 100644 --- a/lib/kernels/include/kernels/partition_kernels.h +++ b/lib/kernels/include/kernels/partition_kernels.h @@ -13,8 +13,7 @@ struct RepartitionPerDeviceState { FF_VISITABLE_STRUCT_NO_EQ(RepartitionPerDeviceState, handle, data_type); -namespace Kernels { -namespace Repartition { +namespace Kernels::Repartition { RepartitionPerDeviceState init_kernel(PerDeviceFFHandle const &handle, DataType data_type); @@ -26,11 +25,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &output_grad, - GenericTensorAccessorR const &input_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); -} // 
namespace Repartition -} // namespace Kernels +} // namespace Kernels::Repartition } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H diff --git a/lib/local-execution/include/local-execution/per_device_op_state.variant.toml b/lib/kernels/include/kernels/per_device_op_state.variant.toml similarity index 100% rename from lib/local-execution/include/local-execution/per_device_op_state.variant.toml rename to lib/kernels/include/kernels/per_device_op_state.variant.toml diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index 798c0507f8..ad0a52efb9 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Pool2DPerDeviceState, poolDesc, relu); -namespace Kernels { -namespace Pool2D { +namespace Kernels::Pool2D { Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle, std::optional activation, @@ -68,15 +67,14 @@ void forward_kernel(ffStream_t stream, void const *input_ptr, void *output_ptr); -void backward_kernel(ffStream_t stream, +void backward_kernel(cudaStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr); + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr); -} // namespace Pool2D -} // namespace Kernels +} // namespace Kernels::Pool2D } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h index 4287472875..cd3930ea1c 100644 --- a/lib/kernels/include/kernels/reduce_kernels.h +++ b/lib/kernels/include/kernels/reduce_kernels.h @@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT(ReducePerDeviceState, op_type, reduction_size); -namespace Kernels { -namespace Reduce { +namespace Kernels::Reduce { ReducePerDeviceState init_kernel(PerDeviceFFHandle const &, OperatorType const &, @@ -43,8 +42,7 @@ void backward_kernel(ffStream_t stream, ReducePerDeviceState const &m, float const *output_grad_ptr, float *input_grad_ptr); -} // namespace Reduce -} // namespace Kernels +} // namespace Kernels::Reduce } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H diff --git a/lib/kernels/include/kernels/reduction_kernels.h b/lib/kernels/include/kernels/reduction_kernels.h index fb3baf215c..12553edd5e 100644 --- a/lib/kernels/include/kernels/reduction_kernels.h +++ b/lib/kernels/include/kernels/reduction_kernels.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Reduction { +namespace FlexFlow::Kernels::Reduction { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -14,11 +12,9 @@ void forward_kernel(ffStream_t stream, size_t num_replicas); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Reduction -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reduction #endif // _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels.h b/lib/kernels/include/kernels/replicate_kernels.h index 409fc81f44..7ed55cd1a1 100644 --- a/lib/kernels/include/kernels/replicate_kernels.h +++ b/lib/kernels/include/kernels/replicate_kernels.h @@ -4,21 
+4,17 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Replicate { +namespace FlexFlow::Kernels::Replicate { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas); -} // namespace Replicate -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Replicate #endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h new file mode 100644 index 0000000000..1c7aa4ee4a --- /dev/null +++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H + +#include "device.h" +#include "kernels/accessor.h" + +namespace FlexFlow::Kernels::Replicate { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + size_t num_replicas); + +} // namespace FlexFlow::Kernels::Replicate + +#endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index a83caa6bea..6e19a9d251 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -13,8 +13,7 @@ struct ReshapePerDeviceState { FF_VISITABLE_STRUCT(ReshapePerDeviceState, data_type); -namespace Kernels { -namespace Reshape { +namespace Kernels::Reshape { ReshapePerDeviceState init_kernel(DataType data_type); @@ -25,11 +24,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, ReshapePerDeviceState const &per_device_state, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Reshape -} // namespace Kernels +} // namespace Kernels::Reshape } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H diff --git a/lib/kernels/include/kernels/reverse_kernels.h b/lib/kernels/include/kernels/reverse_kernels.h index 42a83ae219..deb5b22155 100644 --- a/lib/kernels/include/kernels/reverse_kernels.h +++ b/lib/kernels/include/kernels/reverse_kernels.h @@ -3,9 +3,7 @@ #include "device.h" -namespace FlexFlow { -namespace Kernels { -namespace Reverse { +namespace FlexFlow::Kernels::Reverse { void forward_kernel(ffStream_t stream, float const *in_ptr, @@ -23,8 +21,6 @@ void backward_kernel(ffStream_t stream, coord_t in_blk_size, coord_t input_size); -} // namespace Reverse -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reverse #endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h new file mode 100644 index 0000000000..35af06aafb --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Reverse { + +void 
cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, + GenericTensorAccessorW &input_accessor); +} // namespace FlexFlow::Kernels::Reverse + +#endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/softmax_kernels.h b/lib/kernels/include/kernels/softmax_kernels.h index 061230ec52..520ea61b64 100644 --- a/lib/kernels/include/kernels/softmax_kernels.h +++ b/lib/kernels/include/kernels/softmax_kernels.h @@ -15,8 +15,7 @@ struct SoftmaxPerDeviceState { FF_VISITABLE_STRUCT(SoftmaxPerDeviceState, handle, inputTensor, dim); -namespace Kernels { -namespace Softmax { +namespace Kernels::Softmax { SoftmaxPerDeviceState init_kernel(PerDeviceFFHandle const &handle, int dim, @@ -31,12 +30,11 @@ void forward_kernel(ffStream_t stream, float *output_ptr); void backward_kernel(ffStream_t stream, - float *input_grad_ptr, float const *output_grad_ptr, + float *input_grad_ptr, size_t num_elements); -} // namespace Softmax -} // namespace Kernels +} // namespace Kernels::Softmax } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/split_kernels.h b/lib/kernels/include/kernels/split_kernels.h index 36434d4be8..538b9602c2 100644 --- a/lib/kernels/include/kernels/split_kernels.h +++ b/lib/kernels/include/kernels/split_kernels.h @@ -3,10 +3,7 @@ #include "device.h" -namespace FlexFlow { - -namespace Kernels { -namespace Split { +namespace FlexFlow::Kernels::Split { void forward_kernel(ffStream_t stream, float **out_ptrs, float const *in_ptr, @@ -22,8 +19,6 @@ void backward_kernel(ffStream_t stream, coord_t num_blks, int numOutputs); -} // namespace Split -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Split #endif // _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H diff --git a/lib/kernels/include/kernels/topk_kernels.h b/lib/kernels/include/kernels/topk_kernels.h index ae1c739f6c..6f33381e1a 100644 --- a/lib/kernels/include/kernels/topk_kernels.h +++ b/lib/kernels/include/kernels/topk_kernels.h @@ -12,8 +12,7 @@ struct TopKPerDeviceState { FF_VISITABLE_STRUCT(TopKPerDeviceState, sorted); -namespace Kernels { -namespace TopK { +namespace Kernels::TopK { TopKPerDeviceState init_kernel(bool sorted); @@ -35,8 +34,7 @@ void backward_kernel(ffStream_t stream, int length, int k); -} // namespace TopK -} // namespace Kernels +} // namespace Kernels::TopK } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H diff --git a/lib/kernels/include/kernels/transpose_kernels.h b/lib/kernels/include/kernels/transpose_kernels.h index 56da81ba2b..dbf78826cb 100644 --- a/lib/kernels/include/kernels/transpose_kernels.h +++ b/lib/kernels/include/kernels/transpose_kernels.h @@ -16,8 +16,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(TransposePerDeviceState, num_dim, perm); -namespace Kernels { -namespace Transpose { +namespace Kernels::Transpose { TransposePerDeviceState init_kernel(int num_dim, std::vector const &perm); @@ -29,11 +28,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, TransposePerDeviceState const &m, - GenericTensorAccessorW const &in_grad, - GenericTensorAccessorR const &out_grad); + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad); -} // namespace Transpose -} // namespace Kernels +} // namespace Kernels::Transpose } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H diff --git 
a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index 27b7eb390d..e56bded737 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -1,7 +1,68 @@ #include "kernels/accessor.h" +#include "kernels/allocation.h" +#include "kernels/datatype_dispatch.h" namespace FlexFlow { +void copy_accessor_data_to_l_from_r( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor) { + size_t num_bytes = dst_accessor.shape.get_volume() * + size_of_datatype(dst_accessor.data_type); + + DeviceType dst_device_type = dst_accessor.device_type; + DeviceType src_device_type = src_accessor.device_type; + + if (src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::CPU) { + memcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes); + } else if (src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::GPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyHostToDevice)); + } else if (src_device_type == DeviceType::GPU && + dst_device_type == DeviceType::CPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); + } else { + assert(src_device_type == DeviceType::GPU); + assert(dst_device_type == DeviceType::GPU); + checkCUDA(cudaMemcpy(dst_accessor.ptr, + src_accessor.ptr, + num_bytes, + cudaMemcpyDeviceToDevice)); + } +} + +GenericTensorAccessorW::operator GenericTensorAccessorR() const { + return read_only_accessor_from_write_accessor(*this); +} + +GenericTensorAccessorW::GenericTensorAccessorW( + DataType data_type, + ArrayShape const &shape, + void *ptr, + DeviceType device_type = DeviceType::GPU) + : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {} + +std::tuple + GenericTensorAccessorW::tie() const { + return std::tie(this->data_type, this->shape, this->ptr, this->device_type); +} + +bool GenericTensorAccessorW::operator==( + GenericTensorAccessorW const &other) const { + return this->tie() == other.tie(); +} + +bool GenericTensorAccessorW::operator!=( + GenericTensorAccessorW const &other) const { + return this->tie() != other.tie(); +} + int32_t *GenericTensorAccessorW::get_int32_ptr() const { return this->get(); } @@ -33,6 +94,31 @@ std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) { return (s << fmt::to_string(a)); } +GenericTensorAccessorR::GenericTensorAccessorR( + DataType data_type, + ArrayShape const &shape, + void const *ptr, + DeviceType device_type = DeviceType::GPU) + : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {} + +std::tuple + GenericTensorAccessorR::tie() const { + return std::tie(this->data_type, this->shape, this->ptr, this->device_type); +} + +bool GenericTensorAccessorR::operator==( + GenericTensorAccessorR const &other) const { + return this->tie() == other.tie(); +} + +bool GenericTensorAccessorR::operator!=( + GenericTensorAccessorR const &other) const { + return this->tie() != other.tie(); +} + int32_t const *GenericTensorAccessorR::get_int32_ptr() const { return this->get(); } @@ -64,51 +150,6 @@ std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { return (s << fmt::to_string(a)); } -int32_t *get_int32_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -int64_t *get_int64_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -float *get_float_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -double *get_double_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -half 
*get_half_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) { return get(a); } @@ -156,22 +197,17 @@ std::vector GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &writable) { - return GenericTensorAccessorR{ - writable.data_type, writable.shape, req(writable.ptr)}; + return GenericTensorAccessorR{writable.data_type, + writable.shape, + req(writable.ptr), + writable.device_type}; } -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2) { +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2) { return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; } -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, DataType const &expected_dtype) { @@ -184,9 +220,4 @@ std::pair return std::make_pair(accessor.shape, accessor.data_type); } -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - } // namespace FlexFlow diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index ccd88580db..733146851a 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -11,10 +11,17 @@ void Allocator::deallocate(void *ptr) { this->i_allocator->deallocate(ptr); } +DeviceType Allocator::get_allocation_device_type() const { + return this->i_allocator->get_allocation_device_type(); +} + GenericTensorAccessorW Allocator::allocate_tensor(TensorShape const &tensor_shape) { void *ptr = this->allocate(get_size_in_bytes(tensor_shape)); - return {tensor_shape.data_type, tensor_shape, ptr}; + return {tensor_shape.data_type, + tensor_shape, + ptr, + this->get_allocation_device_type()}; } } // namespace FlexFlow diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index d5e2f1167d..5c18a9ab5a 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -53,6 +53,7 @@ std::size_t ArrayShape::at(ff_dim_t idx) const { ArrayShape ArrayShape::sub_shape( std::optional> start, std::optional> end) const { + NOT_IMPLEMENTED(); } diff --git a/lib/kernels/src/copy_tensor_accessor.cc b/lib/kernels/src/copy_tensor_accessor.cc new file mode 100644 index 0000000000..6a3ad8033a --- /dev/null +++ b/lib/kernels/src/copy_tensor_accessor.cc @@ -0,0 +1,48 @@ +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow { + +template +struct CopyTensorAccessorW { + GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + 
copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return dst_accessor; + } +}; + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +template +struct CopyTensorAccessorR { + GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return read_only_accessor_from_write_accessor(dst_accessor); + } +}; + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc new file mode 100644 index 0000000000..08f5552afc --- /dev/null +++ b/lib/kernels/src/cpu/cast_kernels.cc @@ -0,0 +1,51 @@ +#include "kernels/cast_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow::Kernels::Cast { + +template +void cpu_cast_forward(IDT const *input, ODT *output, size_t volume) { + for (size_t i = 0; i < volume; ++i) { + output[i] = static_cast(input[i]); + } +} + +template +void cpu_cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) { + for (size_t i = 0; i < volume; i++) { + output[i] = static_cast(input[i]) + beta * output[i]; + } +} + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + size_t volume = input.shape.get_volume(); + cpu_cast_forward(input.get(), output.get(), volume); + } +}; + +template +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume(); + cpu_cast_backward( + output.get(), input.get(), volume, cast_to(1.0f)); + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch2{}( + input.data_type, output.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch2{}( + output.data_type, input.data_type, output, input); +} + +} // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/cpu/combine_kernels.cc b/lib/kernels/src/cpu/combine_kernels.cc new file mode 100644 index 0000000000..d0be1f9f2d --- /dev/null +++ b/lib/kernels/src/cpu/combine_kernels.cc @@ -0,0 +1,38 @@ +#include "kernels/combine_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow::Kernels::Combine { + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + memcpy(output.get
<DT>(),
+           input.get<DT>(),
+           input.shape.get_volume() * size_of_datatype(DT));
+  }
+};
+
+template <DataType DT>
+struct CPUBackwardKernel {
+  void operator()(GenericTensorAccessorR const &output_grad,
+                  GenericTensorAccessorW const &input_grad) {
+    size_t num_elements = output_grad.shape.get_volume();
+    for (int i = 0; i < num_elements; ++i) {
+      input_grad.get<DT>()[i] += output_grad.get<DT>()[i];
+    }
+  }
+};
+
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW const &output) {
+  DataTypeDispatch1<CPUForwardKernel>{}(input.data_type, input, output);
+}
+
+void cpu_backward_kernel(GenericTensorAccessorR const &output_grad,
+                         GenericTensorAccessorW const &input_grad) {
+  DataTypeDispatch1<CPUBackwardKernel>{}(
+      input_grad.data_type, output_grad, input_grad);
+}
+
+} // namespace FlexFlow::Kernels::Combine
diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc
new file mode 100644
index 0000000000..cfcb44dac5
--- /dev/null
+++ b/lib/kernels/src/cpu/replicate_kernels.cc
@@ -0,0 +1,44 @@
+#include "kernels/datatype_dispatch.h"
+#include "kernels/replicate_kernels_cpu.h"
+
+namespace FlexFlow::Kernels::Replicate {
+
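+// CPU reference kernels for Replicate: forward is a plain element-wise copy,
+// while backward reduces the gradient over the replica dimension.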
+template <DataType DT>
+struct CPUForwardKernel {
+  void operator()(GenericTensorAccessorR const &input,
+                  GenericTensorAccessorW &output) {
+    memcpy(output.get<DT>(),
+           input.get<DT>(),
+           input.shape.num_elements() * size_of_datatype(DT));
+  }
+};
+
+template <DataType DT>
+struct CPUBackwardKernel {
+  void operator()(GenericTensorAccessorR const &output,
+                  GenericTensorAccessorW &input,
+                  size_t num_replicas) {
+    using T = real_type_t<DT>;
+    for (int i = 0; i < input.shape.num_elements(); i++) {
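+      // Sum the gradient for element i across all replica slots j.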
+      T cur_sum = 0;
+      for (int j = 0; j < num_replicas; j++) {
+        cur_sum += output.at<DT>({i, j});
+      }
+      input.at<DT>({i}) = cur_sum;
+    }
+  }
+};
+
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW &output) {
+  DataTypeDispatch1<CPUForwardKernel>{}(input.data_type, input, output);
+}
+
+void cpu_backward_kernel(GenericTensorAccessorR const &output,
+                         GenericTensorAccessorW &input,
+                         size_t num_replicas) {
+  DataTypeDispatch1<CPUBackwardKernel>{}(
+      input.data_type, output, input, num_replicas);
+}
+
+} // namespace FlexFlow::Kernels::Replicate
diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc
new file mode 100644
index 0000000000..bc73c80e9e
--- /dev/null
+++ b/lib/kernels/src/cpu/reverse_kernels.cc
@@ -0,0 +1,43 @@
+#include "kernels/datatype_dispatch.h"
+#include "kernels/reverse_kernels_cpu.h"
+#include <cassert>
+#include <vector>
+
+namespace FlexFlow::Kernels::Reverse {
+
+template <DataType DT>
+struct CPUReverseForwardKernel {
+  void operator()(GenericTensorAccessorR const &input,
+                  GenericTensorAccessorW &output) {
+    assert(input.data_type == DT && output.data_type == DT);
+
+    int num_out_blocks = input.shape.at(legion_dim_t(0));
+    int reverse_dim_size = input.shape.at(legion_dim_t(1));
+    int in_block_size = input.shape.at(legion_dim_t(2));
+
+    for (int block_idx = 0; block_idx < num_out_blocks; block_idx++) {
+      for (int rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) {
+        for (int i = 0; i < in_block_size; i++) {
+          output.at<DT>({block_idx, rev_idx, i}) =
+              input.at<DT>
({num_out_blocks - 1 - block_idx, + reverse_dim_size - 1 - rev_idx, + in_block_size - 1 - i}); + } + } + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor) { + DataTypeDispatch1{}( + input_accessor.data_type, input_accessor, output_accessor); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, + GenericTensorAccessorW &input_accessor) { + DataTypeDispatch1{}( + output_accessor.data_type, output_accessor, input_accessor); +} + +} // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu index 2ff02038f4..b30cf6a663 100644 --- a/lib/kernels/src/cuda/cuda_helper.cu +++ b/lib/kernels/src/cuda/cuda_helper.cu @@ -29,13 +29,13 @@ cudaError_t get_legion_stream(cudaStream_t *stream) { #error "Unknown device, please make sure if CUDA is enabled" #endif -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +__global__ void scale_kernel(float *ptr, size_t size, float a, float b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } } -__global__ void ones_kernel(float *ptr, coord_t size) { +__global__ void ones_kernel(float *ptr, size_t size) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = 1.0f; } @@ -49,7 +49,7 @@ __global__ void assign_kernel(DT *ptr, size_t size, DT value) { } template -__global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { +__global__ void copy_kernel(DT *dst, const DT *src, size_t size) { CUDA_KERNEL_LOOP(i, size) { dst[i] = src[i]; } @@ -281,11 +281,11 @@ template __global__ void add_kernel(bool *dst, bool const *src, unsigned long size); template __global__ void - copy_kernel(float *dst, float const *src, coord_t size); + copy_kernel(float *dst, float const *src, size_t size); template __global__ void - copy_kernel(int32_t *dst, int32_t const *src, coord_t size); + copy_kernel(int32_t *dst, int32_t const *src, size_t size); template __global__ void - copy_kernel(int64_t *dst, int64_t const *src, coord_t size); + copy_kernel(int64_t *dst, int64_t const *src, size_t size); template __global__ void apply_add_with_scale(float *data_ptr, float const *grad_ptr, diff --git a/lib/kernels/src/cuda/embedding_kernels.cu b/lib/kernels/src/cuda/embedding_kernels.cu index e6a614ba70..c83e9f0a94 100644 --- a/lib/kernels/src/cuda/embedding_kernels.cu +++ b/lib/kernels/src/cuda/embedding_kernels.cu @@ -17,12 +17,11 @@ #include "kernels/datatype_dispatch.h" #include "kernels/embedding_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the intput tensor to avoid out of index range issues rand_generate_int<<>>( @@ -31,36 +30,14 @@ void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the intput tensor to avoid out of index range issues rand_generate_int<<>>( ptr, size, p); } -template -__global__ void embed_forward_no_aggr( - TI const *input, TD *output, TD const *embed, int out_dim, int batch_size); -template -__global__ void embed_forward_with_aggr(TI const *input, - TD *output, - TD const *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); -template -__global__ void 
embed_backward_no_aggr( - TI const *input, TD const *output, TD *embed, int out_dim, int batch_size); -template -__global__ void embed_backward_with_aggr(TI const *input, - TD const *output, - TD *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); - -template +template __global__ void embed_forward_no_aggr(int32_t const *input, TD *output, TD const *embed, @@ -75,7 +52,7 @@ __global__ void embed_forward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_no_aggr(int64_t const *input, TD *output, TD const *embed, @@ -90,14 +67,14 @@ __global__ void embed_forward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int32_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -115,14 +92,14 @@ __global__ void embed_forward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int64_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -140,7 +117,7 @@ __global__ void embed_forward_with_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int32_t const *input, TD const *output, TD *embed, @@ -154,7 +131,7 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int64_t const *input, TD const *output, TD *embed, @@ -171,11 +148,11 @@ __global__ void embed_backward_no_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void embed_backward_no_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ -192,11 +169,11 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } template <> -__global__ void embed_backward_no_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ -212,14 +189,14 @@ __global__ void embed_backward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int32_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -238,14 +215,14 @@ __global__ void embed_backward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int64_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -267,14 +244,13 @@ __global__ void embed_backward_with_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void - 
embed_backward_with_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr) { +__global__ void embed_backward_with_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -301,14 +277,13 @@ __global__ void } template <> -__global__ void - embed_backward_with_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr) { +__global__ void embed_backward_with_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -351,35 +326,219 @@ struct ForwardKernel { int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(weight.data_type == DataType::HALF || - weight.data_type == DataType::FLOAT || - weight.data_type == DataType::DOUBLE); + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr, real_type_t> + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + weight.get(), out_dim, - batch_size); + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if 
(!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - embed_forward_with_aggr, real_type_t> + embed_forward_with_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + weight.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); } } }; @@ -388,39 +547,229 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, std::optional aggr, + GenericTensorAccessorR const &output, GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(output.data_type == DataType::HALF || - output.data_type == DataType::FLOAT || - output.data_type == DataType::DOUBLE); if (!aggr.has_value()) { - embed_backward_no_aggr, real_type_t> + embed_backward_no_aggr <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, batch_size); } else { - embed_backward_with_aggr, real_type_t> + embed_backward_with_aggr <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, 
+ aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); } } }; @@ -448,27 +797,25 @@ void forward_kernel(ffStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, int batch_size) { - DataTypeDispatch2{}(input_data_type, - output_data_type, + DataTypeDispatch2{}(output_data_type, + input_data_type, stream, aggr, - input, output, + input, weight_grad, in_dim, out_dim, batch_size); } -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding diff --git a/lib/kernels/src/cuda/metrics_functions.cu b/lib/kernels/src/cuda/metrics_functions.cu index 2e037eb472..0250f829ec 100644 --- a/lib/kernels/src/cuda/metrics_functions.cu +++ b/lib/kernels/src/cuda/metrics_functions.cu @@ -13,17 +13,42 @@ * limitations under the License. 
*/ -#include "flexflow/model.h" -#include "flexflow/utils/cuda_helper.h" +#include "device.h" +#include "kernels/metrics_kernels.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric.h" namespace FlexFlow { +struct CUDAPerfMetrics { + int train_all; + int train_correct; + float cce_loss; + float sparse_cce_loss; + float mse_loss; + float rmse_loss; + float mae_loss; + double start_time; + double current_time; + + CUDAPerfMetrics() = delete; + CUDAPerfMetrics(PerfMetrics const &perf) + : train_all(perf.train_all), + train_correct(perf.train_correct.value_or(-1)), + cce_loss(perf.cce_loss.value_or(-1)), + sparse_cce_loss(perf.sparse_cce_loss.value_or(-1)), + mse_loss(perf.mse_loss.value_or(-1)), + rmse_loss(perf.rmse_loss.value_or(-1)), + mae_loss(perf.mae_loss.value_or(-1)), start_time(perf.start_time), + current_time(perf.current_time) {} +}; + float const LOG_MIN_VALUE = 0.00000001f; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -72,8 +97,8 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, __global__ void update_metrics_label_kernel(float const *logits, float const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -136,17 +161,17 @@ __global__ void update_metrics_label_kernel(float const *logits, } } -void Metrics::update_metrics_sparse_label_kernel_wrapper( - float const *logit_ptr, - int const *label_ptr, - Metrics const *me, - int num_effective_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const *me, + int num_effective_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -154,32 +179,33 @@ void Metrics::update_metrics_sparse_label_kernel_wrapper( CUDA_NUM_THREADS, 0, stream>>>( - logit_ptr, label_ptr, perf, *me, num_effective_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, *me, num_effective_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } -void Metrics::update_metrics_label_kernel_wrapper(float const *logit_ptr, - float const *label_ptr, - Metrics const *me, - int num_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const *me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics 
perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); update_metrics_label_kernel<<>>( - logit_ptr, label_ptr, perf, *me, num_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, *me, num_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } }; // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu index 6c6e17a181..512981e32b 100644 --- a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu @@ -53,9 +53,9 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, BatchNormPerDeviceState const &m, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index b895ffb68f..afc3e1f7ef 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -50,30 +50,26 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume(); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume(); cast_backward<<>>( - input.get(), output.get(), volume, cast_to(1.0f)); + output.get(), input.get(), volume, cast_to(1.0f)); } }; void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + input.data_type, output.data_type, stream, input, output); } void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + output.data_type, input.data_type, stream, output, input); } } // namespace Cast diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu index 68004738d2..ad216feda2 100644 --- a/lib/kernels/src/cuda/ops/concat_kernels.cu +++ b/lib/kernels/src/cuda/ops/concat_kernels.cu @@ -17,9 +17,7 @@ #include "kernels/concat_kernels.h" #include -namespace FlexFlow { -namespace Kernels { -namespace Concat { +namespace FlexFlow::Kernels::Concat { void calc_blk_size(size_t &num_blocks, size_t &blk_size, @@ -87,6 +85,4 @@ void backward_kernel(cudaStream_t stream, } } -} // namespace Concat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Concat diff --git a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu index 
e3a4c97a31..0a4024ba8a 100644 --- a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu @@ -313,10 +313,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/element_unary_kernels.cu b/lib/kernels/src/cuda/ops/element_unary_kernels.cu index a35d28fa8c..687a9fa220 100644 --- a/lib/kernels/src/cuda/ops/element_unary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_unary_kernels.cu @@ -290,10 +290,10 @@ struct BackwardKernel { OperatorType op_type, std::optional scalar, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { checkCUDNN(cudnnSetStream(handle.dnn, stream)); if (use_cudnn(op_type)) { @@ -356,20 +356,20 @@ void backward_kernel(ffStream_t stream, ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { DataTypeDispatch1{}(input.data_type, stream, device_state, get_op_type(attrs), attrs.scalar, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } } // namespace ElementUnary diff --git a/lib/kernels/src/cuda/ops/flat_kernels.cu b/lib/kernels/src/cuda/ops/flat_kernels.cu index 941db108a0..f661e5fb0a 100644 --- a/lib/kernels/src/cuda/ops/flat_kernels.cu +++ b/lib/kernels/src/cuda/ops/flat_kernels.cu @@ -34,8 +34,8 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr) { + float const *output_grad_ptr, + float *input_grad_ptr) { float alpha = 1.0f; apply_add_with_scale diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index ca51f0d216..0d5a772918 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -108,10 +108,10 @@ LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, void forward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *weight_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size) { @@ -135,14 +135,14 @@ void forward_kernel(cudaStream_t stream, batch_size, in_dim, &alpha, - weight_ptr, + static_cast(weight_ptr), weight_type, in_dim, - input_ptr, + static_cast(input_ptr), input_type, in_dim, &beta, - output_ptr, + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -156,14 +156,14 @@ void forward_kernel(cudaStream_t stream, batch_size, 1, &alpha, - bias_ptr, + static_cast(bias_ptr), weight_type, 1, - m.one_ptr, + static_cast(m.one_ptr), 
CUDA_R_32F, 1, &alpha, - output_ptr, + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -174,10 +174,10 @@ void forward_kernel(cudaStream_t stream, m.actiDesc, &alpha, m.outputTensor, - output_ptr, + static_cast(output_ptr), &beta, m.outputTensor, - output_ptr)); + static_cast(output_ptr))); } else if (m.activation == Activation::GELU) { size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) @@ -191,13 +191,13 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_grad_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, int in_dim, int out_dim, int batch_size) { @@ -216,11 +216,17 @@ void backward_kernel(cudaStream_t stream, int output_size = out_dim * batch_size; if (m.activation.has_value()) { if (m.activation == Activation::RELU) { - relu_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + relu_backward_kernel(m.output_type, + static_cast(output_grad_ptr), + static_cast(output_ptr), + output_size, + stream); } else if (m.activation == Activation::SIGMOID) { - sigmoid_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + sigmoid_backward_kernel(m.output_type, + static_cast(output_grad_ptr), + static_cast(output_ptr), + output_size, + stream); } else { // TODO: only support relu and sigmoid for now assert(false && "Unsupported activation for Linear"); @@ -235,14 +241,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - input_ptr, + static_cast(input_ptr), input_type, in_dim, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - kernel_grad_ptr, + static_cast(kernel_grad_ptr), weight_type, in_dim, compute_type, @@ -261,12 +267,12 @@ void backward_kernel(cudaStream_t stream, in_dim, out_dim, &alpha, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim, &lambda, - (float *)kernel_ptr, + kernel_ptr, in_dim, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim)); } else { assert(false && "Only L2 regularization is supported"); @@ -284,14 +290,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - m.one_ptr, + static_cast(m.one_ptr), CUDA_R_32F, 1, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - bias_grad_ptr, + static_cast(bias_grad_ptr), weight_type, 1, compute_type, @@ -307,14 +313,14 @@ void backward_kernel(cudaStream_t stream, batch_size, out_dim, &alpha, - kernel_ptr, + static_cast(kernel_ptr), weight_type, in_dim, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - input_grad_ptr, + static_cast(input_grad_ptr), input_type, in_dim, compute_type, diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu index 1d07efb5fa..3687c1cedf 100644 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ b/lib/kernels/src/cuda/ops/partition_kernels.cu @@ -39,8 +39,8 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const 
&output_grad, + GenericTensorAccessorW const &input_grad) { add_kernel><<{}( - m.data_type, stream, m, input_grad, output_grad); + m.data_type, stream, m, output_grad, input_grad); } } // namespace Repartition diff --git a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu index 51fa29d289..f8b35ec885 100644 --- a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu @@ -112,10 +112,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr) { + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu index 0c6ba7d8e3..9c3e8dcc40 100644 --- a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduction_kernels.cu @@ -54,8 +54,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { checkCUDA(cudaMemcpyAsync(input.get(), output.get(), input.shape.num_elements() * size_of_datatype(T), @@ -73,9 +73,9 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { - DataTypeDispatch1{}(input.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(output.data_type, stream, output, input); } } // namespace Reduction diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index 76bfbe2658..1aa61375f0 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -50,8 +50,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { size_t total_elements = input.shape.num_elements() * num_replicas; replicate_backward_kernel> @@ -70,11 +70,11 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { DataTypeDispatch1{}( - input.data_type, stream, input, output, num_replicas); + input.data_type, stream, output, input, num_replicas); } } // namespace Replicate diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index 5b7843a3a5..b7a328ca08 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -42,8 +42,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { float alpha = 1.0f; apply_add_with_scale> <<{}(m.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + 
DataTypeDispatch1{}(m.data_type, stream, output, input); } } // namespace Reshape diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 8391a499df..2c25293c36 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -17,7 +17,6 @@ #include "kernels/reverse_kernels.h" namespace FlexFlow { - namespace Kernels { namespace Reverse { diff --git a/lib/kernels/src/cuda/ops/softmax_kernels.cu b/lib/kernels/src/cuda/ops/softmax_kernels.cu index 93ed85de18..d2498d08a4 100644 --- a/lib/kernels/src/cuda/ops/softmax_kernels.cu +++ b/lib/kernels/src/cuda/ops/softmax_kernels.cu @@ -61,8 +61,8 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - float *input_grad_ptr, float const *output_grad_ptr, + float *input_grad_ptr, size_t num_elements) { checkCUDA(cudaMemcpyAsync(input_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/transpose_kernels.cu b/lib/kernels/src/cuda/ops/transpose_kernels.cu index 3b3f80944d..37e1a08326 100644 --- a/lib/kernels/src/cuda/ops/transpose_kernels.cu +++ b/lib/kernels/src/cuda/ops/transpose_kernels.cu @@ -91,8 +91,8 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, TransposePerDeviceState const &m, - GenericTensorAccessorW const &in_grad, - GenericTensorAccessorR const &out_grad) { + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad) { TransposeStrides info; info.num_dim = in_grad.shape.num_dims(); diff --git a/lib/kernels/src/cuda/optimizer_kernel.cu b/lib/kernels/src/cuda/optimizer_kernels.cu similarity index 81% rename from lib/kernels/src/cuda/optimizer_kernel.cu rename to lib/kernels/src/cuda/optimizer_kernels.cu index 439eed9dec..1c6954a0b0 100644 --- a/lib/kernels/src/cuda/optimizer_kernel.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -13,7 +13,9 @@ * limitations under the License. 
*/ +#include "device.h" #include "kernels/optimizer_kernels.h" +#include "utils/exception.h" namespace FlexFlow { @@ -80,13 +82,25 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); + + auto const &state = meta->raw_variant; + ncclComm_t comm = std::visit( + [](auto const &s) -> ncclComm_t { + using T = std::decay_t; + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + throw mk_runtime_error("State type does not support NCCL operations"); + } else { + return s.handle.ncclComm; + } + }, + state); + + checkNCCL(ncclAllReduce( + w_grad_ptr, (float *)w_grad_ptr, size, ncclFloat, ncclSum, comm, stream)); + // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]"); @@ -157,7 +171,7 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, for (int i = 1; i < num_replicas; i++) { float const *src = w_grad_ptr + i * size; add_kernel<<>>( - size, 1.0f, src, (float *)w_grad_ptr); + (float *)w_grad_ptr, src, size); } // checkCUDA(cudaDeviceSynchronize()); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", @@ -188,13 +202,24 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, // Use NCCL to sync gradients cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); + + auto const &state = meta->raw_variant; + ncclComm_t comm = std::visit( + [](auto const &s) -> ncclComm_t { + using T = std::decay_t; + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + throw mk_runtime_error("State type does not support NCCL operations"); + } else { + return s.handle.ncclComm; + } + }, + state); + + checkNCCL(ncclAllReduce( + w_grad_ptr, (float *)w_grad_ptr, size, ncclFloat, ncclSum, comm, stream)); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", // op->alpha, op->alpha_t, op->weight_decay); // Step 2: Adam update diff --git a/lib/kernels/src/local_cpu_allocator.cc b/lib/kernels/src/local_cpu_allocator.cc new file mode 100644 index 0000000000..5cf337c685 --- /dev/null +++ b/lib/kernels/src/local_cpu_allocator.cc @@ -0,0 +1,30 @@ +#include "kernels/local_cpu_allocator.h" +#include "kernels/device.h" +#include "utils/containers/contains_key.h" + +namespace FlexFlow { +void *LocalCPUAllocator::allocate(size_t requested_memory_size) { + void *ptr = malloc(requested_memory_size); + this->ptrs.insert({ptr, std::unique_ptr(ptr, free)}); + return ptr; +} + +void LocalCPUAllocator::deallocate(void *ptr) { + if (contains_key(this->ptrs, ptr)) { + this->ptrs.erase(ptr); + } else { + throw std::runtime_error( + "Deallocating a pointer that was not allocated by this Allocator"); + } +} + +DeviceType LocalCPUAllocator::get_allocation_device_type() const { + return DeviceType::CPU; +} + +Allocator create_local_cpu_memory_allocator() { + Allocator allocator = Allocator::create(); + return allocator; +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/local_cuda_allocator.cc index cdcfb017a0..416768a479 100644 --- 
a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/local_cuda_allocator.cc @@ -20,6 +20,10 @@ void LocalCudaAllocator::deallocate(void *ptr) { } } +DeviceType LocalCudaAllocator::get_allocation_device_type() const { + return DeviceType::GPU; +} + LocalCudaAllocator::~LocalCudaAllocator() { for (void *ptr : this->ptrs) { checkCUDA(cudaFree(ptr)); @@ -27,7 +31,8 @@ LocalCudaAllocator::~LocalCudaAllocator() { } Allocator create_local_cuda_memory_allocator() { - return Allocator::create(); + Allocator allocator = Allocator::create(); + return allocator; } } // namespace FlexFlow diff --git a/lib/kernels/src/managed_ff_stream.cc b/lib/kernels/src/managed_ff_stream.cc index 7385b6cc3e..f0348aa91c 100644 --- a/lib/kernels/src/managed_ff_stream.cc +++ b/lib/kernels/src/managed_ff_stream.cc @@ -1,28 +1,36 @@ #include "kernels/managed_ff_stream.h" +#include "utils/exception.h" namespace FlexFlow { ManagedFFStream::ManagedFFStream() : stream(new ffStream_t) { - checkCUDA(cudaStreamCreate(stream)); + checkCUDA(cudaStreamCreate(this->stream)); } ManagedFFStream::ManagedFFStream(ManagedFFStream &&other) noexcept : stream(std::exchange(other.stream, nullptr)) {} ManagedFFStream &ManagedFFStream::operator=(ManagedFFStream &&other) noexcept { - std::swap(this->stream, other.stream); + if (this != &other) { + this->cleanup(); + this->stream = std::exchange(other.stream, nullptr); + } return *this; } ManagedFFStream::~ManagedFFStream() { - if (stream != nullptr) { - checkCUDA(cudaStreamDestroy(*stream)); - delete stream; + this->cleanup(); +} + +void ManagedFFStream::cleanup() { + if (this->stream != nullptr) { + checkCUDA(cudaStreamDestroy(*this->stream)); + delete this->stream; } } ffStream_t const &ManagedFFStream::raw_stream() const { - return *stream; + return *this->stream; } } // namespace FlexFlow diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index c050e887b6..9f1737240e 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -3,14 +3,15 @@ namespace FlexFlow { -ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { - handle = new PerDeviceFFHandle; - handle->workSpaceSize = 1024 * 1024; - handle->allowTensorOpMathConversion = true; - - checkCUDNN(cudnnCreate(&handle->dnn)); - checkCUBLAS(cublasCreate(&handle->blas)); - checkCUDA(cudaMalloc(&handle->workSpace, handle->workSpaceSize)); +ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( + size_t workSpaceSize, bool allowTensorOpMathConversion) { + this->handle = new PerDeviceFFHandle{}; + this->handle->workSpaceSize = workSpaceSize; + this->handle->allowTensorOpMathConversion = allowTensorOpMathConversion; + + checkCUDNN(cudnnCreate(&this->handle->dnn)); + checkCUBLAS(cublasCreate(&this->handle->blas)); + checkCUDA(cudaMalloc(&this->handle->workSpace, this->handle->workSpaceSize)); } ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( @@ -19,16 +20,23 @@ ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( ManagedPerDeviceFFHandle &ManagedPerDeviceFFHandle::operator=( ManagedPerDeviceFFHandle &&other) noexcept { - std::swap(this->handle, other.handle); + if (this != &other) { + this->cleanup(); + this->handle = std::exchange(other.handle, nullptr); + } return *this; } ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { - if (handle != nullptr) { - checkCUDNN(cudnnDestroy(handle->dnn)); - checkCUBLAS(cublasDestroy(handle->blas)); - checkCUDA(cudaFree(handle->workSpace)); - delete handle; + this->cleanup(); +} + 
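+// Shared teardown used by both the destructor and move assignment; a
+// moved-from object leaves `handle` null, making cleanup() a no-op.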
+void ManagedPerDeviceFFHandle::cleanup() { + if (this->handle != nullptr) { + checkCUDNN(cudnnDestroy(this->handle->dnn)); + checkCUBLAS(cublasDestroy(this->handle->blas)); + checkCUDA(cudaFree(this->handle->workSpace)); + delete this->handle; } } diff --git a/lib/kernels/test/CMakeLists.txt b/lib/kernels/test/CMakeLists.txt index 007740b510..981f87b3d8 100644 --- a/lib/kernels/test/CMakeLists.txt +++ b/lib/kernels/test/CMakeLists.txt @@ -14,4 +14,5 @@ ff_add_test_executable( cudnn cudart cublas + pcg ) diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index d44129ece1..023233ecb0 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -13,7 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) { size_t qoSeqLength = 20, kvSeqLength = 20; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -33,16 +35,16 @@ TEST_SUITE(FF_TEST_SUITE) { kvSeqLength, false); - TensorShape query_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, qSize}); - TensorShape key_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, kSize}); - TensorShape value_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, vSize}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, oProjSize}); + TensorShape query_shape = make_tensor_shape_from_legion_dims( + {qoSeqLength, num_samples, qSize}, DataType::FLOAT); + TensorShape key_shape = make_tensor_shape_from_legion_dims( + {kvSeqLength, num_samples, kSize}, DataType::FLOAT); + TensorShape value_shape = make_tensor_shape_from_legion_dims( + {kvSeqLength, num_samples, vSize}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {qoSeqLength, num_samples, oProjSize}, DataType::FLOAT); TensorShape weight_shape = - make_float_tensor_shape_from_legion_dims({state.weightSize}); + make_tensor_shape_from_legion_dims({state.weightSize}, DataType::FLOAT); GenericTensorAccessorW query_accessor = create_random_filled_accessor_w(query_shape, allocator); @@ -66,9 +68,7 @@ TEST_SUITE(FF_TEST_SUITE) { weight_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index 18e6977148..8a11a069f5 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -15,16 +15,18 @@ TEST_SUITE(FF_TEST_SUITE) { size_t seq_length = -1; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape_a = - make_float_tensor_shape_from_legion_dims({m, k, batch}); + make_tensor_shape_from_legion_dims({m, k, batch}, DataType::FLOAT); TensorShape input_shape_b = - make_float_tensor_shape_from_legion_dims({k, n, batch}); + 
make_tensor_shape_from_legion_dims({k, n, batch}, DataType::FLOAT); TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({m, n, batch}); + make_tensor_shape_from_legion_dims({m, n, batch}, DataType::FLOAT); GenericTensorAccessorW a_accessor = create_random_filled_accessor_w(input_shape_a, allocator); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 8487bbda6a..03a3a1ad40 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/batch_norm_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -9,7 +10,9 @@ TEST_SUITE(FF_TEST_SUITE) { size_t output_n = 1, output_c = 10, output_h = 10, output_w = 10; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -23,25 +26,25 @@ TEST_SUITE(FF_TEST_SUITE) { output_w, true); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape scale_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape bias_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape scale_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape bias_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); - GenericTensorAccessorW scale_accessor = - create_filled_accessor_w(scale_shape, allocator, 1.0f); + GenericTensorAccessorW scale_accessor = create_filled_accessor_w( + scale_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { - GenericTensorAccessorW bias_accessor = - create_filled_accessor_w(bias_shape, allocator, 0.0f); + GenericTensorAccessorW bias_accessor = create_filled_accessor_w( + bias_shape, allocator, make_float_data_type_value(0)); Kernels::BatchNorm::forward_kernel(managed_stream.raw_stream(), state, @@ -50,10 +53,7 @@ TEST_SUITE(FF_TEST_SUITE) { scale_accessor.get_float_ptr(), bias_accessor.get_float_ptr()); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { @@ -68,28 +68,18 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::BatchNorm::backward_kernel(managed_stream.raw_stream(), state, - input_accessor.get_float_ptr(), - output_grad_accessor.get_float_ptr(), output_accessor.get_float_ptr(), + output_grad_accessor.get_float_ptr(), + input_accessor.get_float_ptr(), 
input_grad_accessor.get_float_ptr(), scale_accessor.get_float_ptr(), scale_grad_accessor.get_float_ptr(), bias_grad_accessor.get_float_ptr(), input_accessor.shape.num_elements()); - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - std::vector host_scale_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(scale_grad_accessor)); - std::vector host_bias_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(bias_grad_accessor)); - - CHECK(contains_non_zero(host_input_grad_data)); - CHECK(contains_non_zero(host_scale_grad_data)); - CHECK(contains_non_zero(host_bias_grad_data)); + CHECK(contains_non_zero(input_grad_accessor)); + CHECK(contains_non_zero(scale_grad_accessor)); + CHECK(contains_non_zero(bias_grad_accessor)); } Kernels::BatchNorm::cleanup_kernel(allocator, diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index b110208bce..1be5839a9c 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -1,7 +1,7 @@ #include "doctest/doctest.h" #include "kernels/cast_kernels.h" +#include "kernels/cast_kernels_cpu.h" #include "test_utils.h" -#include using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { @@ -11,46 +11,68 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); TensorShape output_shape = - make_double_tensor_shape_from_legion_dims({100, 100}); - - GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + make_tensor_shape_from_legion_dims({100, 100}, DataType::DOUBLE); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - - Kernels::Cast::forward_kernel(managed_stream.raw_stream(), - input_accessor, - output_accessor, - DataType::FLOAT, - DataType::DOUBLE); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); - std::vector host_double_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor, output_accessor); - CHECK(contains_non_zero(host_double_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { + GenericTensorAccessorR grad_output_accessor = + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW grad_input_accessor = - allocator.allocate_tensor(input_shape); - - Kernels::Cast::backward_kernel( - managed_stream.raw_stream(), - read_only_accessor_from_write_accessor(output_accessor), - grad_input_accessor, - DataType::DOUBLE, - DataType::FLOAT); - - std::vector host_grad_float_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(grad_input_accessor)); - CHECK(contains_non_zero(host_grad_float_data)); + create_zero_filled_accessor_w(input_shape, allocator); + + Kernels::Cast::backward_kernel(managed_stream.raw_stream(), + grad_output_accessor, + grad_input_accessor); + + CHECK(contains_non_zero(grad_input_accessor)); + } + } + + TEST_CASE("Check Cast Forward Kernel against CPU Kernel") { + ManagedFFStream 
managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = + make_tensor_shape_from_legion_dims({10, 2}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({10, 2}, DataType::DOUBLE); + + // Only calling forward kernel as backward kernel is exactly the same + SUBCASE("forward_kernel") { + // Run GPU Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Cast::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 2e1000cb95..a4688a1030 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -1,39 +1,37 @@ #include "doctest/doctest.h" #include "kernels/combine_kernels.h" +#include "kernels/combine_kernels_cpu.h" #include "test_utils.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test combine kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + TEST_CASE("Call Combine Forward and Backward Kernels") { + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Combine::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -41,9 +39,64 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor, input_grad_accessor); - std::vector host_input_grad = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Combine Forward Kernel against CPU Kernel") { + ManagedFFStream 
managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = + make_tensor_shape_from_legion_dims({5, 5}, DataType::FLOAT); + TensorShape output_shape = input_shape; + + SUBCASE("forward_kernel") { + // Run GPU Combine Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Combine Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Combine::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu); + + // Run CPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, + input_grad_accessor_cpu); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index bf2a521b4e..b299f5dea8 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -1,29 +1,31 @@ #include "doctest/doctest.h" #include "kernels/concat_kernels.h" #include "test_utils.h" +#include "utils/containers/repeat.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { - size_t num_inputs = 3; - size_t size_per_input = 100; - ff_dim_t concat_axis = ff_dim_t(0); + size_t num_inputs = 2; + size_t size_per_input = 10; + ff_dim_t concat_axis = ff_dim_t{1}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({size_per_input}); - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({size_per_input, num_inputs}); + make_tensor_shape_from_legion_dims({size_per_input}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {num_inputs, size_per_input}, DataType::FLOAT); Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { std::vector input_accessors = repeat(num_inputs, [&]() { - return read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + return create_random_filled_accessor_r(input_shape, allocator); }); GenericTensorAccessorW 
output_accessor = allocator.allocate_tensor(output_shape); @@ -33,17 +35,12 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessors, concat_axis); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); std::vector input_grad_accessors = repeat( num_inputs, [&]() { return allocator.allocate_tensor(input_shape); }); diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 81f3c7183a..4be2bdf7bb 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -1,6 +1,7 @@ #include "doctest/doctest.h" #include "kernels/dropout_kernels.h" #include "test_utils.h" +#include "utils/containers/count.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { @@ -13,11 +14,13 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10}); + make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -30,8 +33,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -40,11 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output_accessor = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_accessor)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 70894858e3..0bb69aa1dc 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/flat_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -7,15 +8,18 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Flat Kernel") { Allocator allocator = create_local_cuda_memory_allocator(); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 2.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2))); 
SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -25,33 +29,21 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor, output_accessor.get_float_ptr()); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements(), 2.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 0.0f); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(0)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1)); Kernels::Flat::backward_kernel(managed_stream.raw_stream(), input_accessor, - input_grad_accessor.get_float_ptr(), - output_grad_accessor.get_float_ptr()); - - std::vector backward_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr()); - std::vector expected_output_data( - input_accessor.shape.num_elements(), 1.0f); - CHECK(backward_output_data == expected_output_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 88ac2f6889..7f97563217 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -5,24 +5,26 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); GatherPerDeviceState state = {managed_handle.raw_handle(), legion_dim_t(2)}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({50}, DataType::FLOAT); GenericTensorAccessorR index_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -32,16 +34,12 @@ TEST_SUITE(FF_TEST_SUITE) { index_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - 
create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); @@ -51,10 +49,7 @@ TEST_SUITE(FF_TEST_SUITE) { index_accessor, input_grad_accessor); - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 03b2f56bb9..7d7298f83d 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/layer_norm_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -11,13 +12,15 @@ TEST_SUITE(FF_TEST_SUITE) { float epsilon = 1e-5f; bool elementwise_affine = true; - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({batch_size, feature_size}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {batch_size, feature_size}, DataType::FLOAT); TensorShape output_shape = input_shape; TensorShape feature_shape = - make_float_tensor_shape_from_legion_dims({feature_size}); + make_tensor_shape_from_legion_dims({feature_size}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -31,16 +34,15 @@ TEST_SUITE(FF_TEST_SUITE) { epsilon); GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - GenericTensorAccessorW gamma_accessor = - create_filled_accessor_w(feature_shape, allocator, 1.0f); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW gamma_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - GenericTensorAccessorW beta_accessor = - create_filled_accessor_w(feature_shape, allocator, 0.0f); + GenericTensorAccessorW beta_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(0)); Kernels::LayerNorm::forward_kernel(managed_stream.raw_stream(), state, @@ -52,8 +54,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW gamma_grad_accessor = diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc new file mode 100644 index 0000000000..605aa6ffa1 --- /dev/null +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -0,0 +1,31 @@ +#include "doctest/doctest.h" +#include "kernels/managed_ff_stream.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("ManagedFFStream") { + ManagedFFStream base_stream{}; + ffStream_t const 
*base_stream_ptr = &base_stream.raw_stream(); + + SUBCASE("move constructor") { + ManagedFFStream new_stream(std::move(base_stream)); + CHECK(&base_stream.raw_stream() == nullptr); + CHECK(&new_stream.raw_stream() == base_stream_ptr); + } + + SUBCASE("move assignment operator") { + SUBCASE("move assign to other") { + ManagedFFStream new_stream{}; + new_stream = std::move(base_stream); + CHECK(&base_stream.raw_stream() == nullptr); + CHECK(&new_stream.raw_stream() == base_stream_ptr); + } + + SUBCASE("move assign to self") { + base_stream = std::move(base_stream); + CHECK(&base_stream.raw_stream() == base_stream_ptr); + } + } + } +} diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc new file mode 100644 index 0000000000..de3e5b72b1 --- /dev/null +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -0,0 +1,38 @@ +#include "doctest/doctest.h" +#include "kernels/managed_per_device_ff_handle.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("ManagedPerDeviceFFHandle") { + ManagedPerDeviceFFHandle base_handle{1024 * 1024, true}; + PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); + + SUBCASE("constructor") { + CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); + CHECK(base_handle.raw_handle().allowTensorOpMathConversion == true); + } + + SUBCASE("move constructor") { + ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); + + CHECK(&base_handle.raw_handle() == nullptr); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } + + SUBCASE("move assignment operator") { + SUBCASE("move assign to other") { + ManagedPerDeviceFFHandle new_handle{1024 * 1024, true}; + new_handle = std::move(base_handle); + + CHECK(&base_handle.raw_handle() == nullptr); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } + + SUBCASE("move assign to self") { + base_handle = std::move(base_handle); + CHECK(&base_handle.raw_handle() == base_handle_ptr); + } + } + } +} diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 437b37e954..e88c811803 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -1,12 +1,15 @@ #include "doctest/doctest.h" #include "kernels/partition_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -15,47 +18,33 @@ TEST_SUITE(FF_TEST_SUITE) { managed_handle.raw_handle(), DataType::FLOAT); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10}); + make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + GenericTensorAccessorR input_accessor = create_filled_accessor_r( + input_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Repartition::forward_kernel( managed_stream.raw_stream(), state, input_accessor, 
output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(1)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2)); Kernels::Repartition::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index ebb92d39db..00fa968235 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/pool_2d_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -12,7 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) { PoolOp pool_type = PoolOp::MAX; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -36,10 +39,10 @@ TEST_SUITE(FF_TEST_SUITE) { stride_w, pool_type); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {input_w, input_h, input_c, input_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_w, output_h, output_c, output_n}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {input_w, input_h, input_c, input_n}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {output_w, output_h, output_c, output_n}, DataType::FLOAT); GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); @@ -52,28 +55,23 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.ptr, output_accessor.ptr); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorW output_grad_accessor = create_filled_accessor_w( + output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Pool2D::backward_kernel(managed_stream.raw_stream(), state, - input_accessor.ptr, - 
input_grad_accessor.ptr, output_accessor.ptr, - output_grad_accessor.ptr); + output_grad_accessor.ptr, + input_accessor.ptr, + input_grad_accessor.ptr); - std::vector host_input_grad = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 1ea740f336..1c389cb20d 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/reduction_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -7,20 +8,22 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Reduction Forward and Backward Kernel") { std::size_t num_replicas = 5; - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10, 10, 10, 10}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {10, 10, 10, 10, 10}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({10}); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({10}, DataType::FLOAT); GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -29,30 +32,22 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor, num_replicas); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { TensorShape output_shape = input_shape; - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Reduction::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor, - output_grad_accessor); - - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements(), 1.0f); - std::vector host_grad_data = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(host_grad_data == expected_grad_input_data); + output_grad_accessor, + input_grad_accessor); + + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 86d790f03c..27223cc7b5 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -1,55 +1,113 @@ #include "doctest/doctest.h" #include "kernels/replicate_kernels.h" +#include "kernels/replicate_kernels_cpu.h" #include "test_utils.h" using namespace ::FlexFlow; 
TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test Replicate Kernel") { + TEST_CASE("Call Replicate Forward and Backward Kernels") { std::size_t num_replicas = 10; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); - TensorShape output_shape = input_shape; + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Replicate::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor, output_grad_accessor, + input_grad_accessor, + num_replicas); + + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Replicate Forward and Backward Kernel against CPU Kernel") { + std::size_t num_replicas = 2; + + TensorShape input_shape = + make_tensor_shape_from_legion_dims({5}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({5, num_replicas}, DataType::FLOAT); + + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + // Run GPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Replicate::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Replicate 
Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu, num_replicas); - std::vector check_aggregated_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(check_aggregated_data)); + // Run CPU Replicate Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Replicate::cpu_backward_kernel( + output_grad_accessor_cpu, input_grad_accessor_cpu, num_replicas); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index f56bfacc2b..5c04012da2 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -5,12 +5,15 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Reshape Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = input_shape; ReshapePerDeviceState state = @@ -18,42 +21,28 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Reshape::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + allocator.allocate_tensor(input_shape); Kernels::Reshape::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } 
} } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index cdaf65a305..4adf79847a 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -1,5 +1,7 @@ #include "doctest/doctest.h" #include "kernels/reverse_kernels.h" +#include "kernels/reverse_kernels_cpu.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -9,18 +11,21 @@ TEST_SUITE(FF_TEST_SUITE) { std::size_t in_blk_size = 10; std::size_t num_out_blks = 1; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1))); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -32,17 +37,14 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_accessor.shape.num_elements()); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(check_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + allocator.allocate_tensor(input_shape); Kernels::Reverse::backward_kernel( managed_stream.raw_stream(), @@ -53,10 +55,85 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_grad_accessor.shape.num_elements()); - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { + std::size_t num_out_blks = 4; + std::size_t reverse_dim_size = 3; + std::size_t in_blk_size = 2; + + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); + TensorShape output_shape = input_shape; + + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + auto transform = [counter = 0.0f](float val) mutable { + return counter++; + }; + + // Run GPU Cast Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), + 
input_accessor_gpu.get_float_ptr(), + output_accessor_gpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_accessor_gpu.shape.num_elements()); + + // Run CPU Reverse Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Reverse Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Reverse::backward_kernel( + managed_stream.raw_stream(), + output_grad_accessor_gpu.get_float_ptr(), + input_grad_accessor_gpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_grad_accessor_gpu.shape.num_elements()); + + // Run CPU Reverse Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Reverse::cpu_backward_kernel(output_grad_accessor_cpu, + input_grad_accessor_cpu); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index f49c1ebbcc..5519c30b80 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -8,12 +8,15 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Softmax Kernel Operations") { int input_n = 1, input_c = 1, input_h = 1, input_w = 100, channels = 100; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = input_shape; SoftmaxPerDeviceState state = Kernels::Softmax::init_kernel( @@ -31,30 +34,22 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Softmax::backward_kernel( managed_stream.raw_stream(), - input_grad_accessor.get_float_ptr(), output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr(), output_grad_accessor.shape.num_elements()); - std::vector expected_input_grad_data = - std::vector(input_grad_accessor.shape.num_elements(), 1.0f); - std::vector host_input_grad_data = -
load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(host_input_grad_data == expected_input_grad_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index 7cc2b28c9e..34993fa151 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -1,6 +1,8 @@ #include "doctest/doctest.h" #include "kernels/split_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" +#include "utils/containers/repeat.h" using namespace ::FlexFlow; @@ -11,13 +13,17 @@ TEST_SUITE(FF_TEST_SUITE) { coord_t in_blk_size = 100; coord_t num_blks = 1; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({50}, DataType::FLOAT); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = @@ -46,8 +52,8 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 0.0f); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(0)); Kernels::Split::backward_kernel(managed_stream.raw_stream(), input_grad_accessor.get_float_ptr(), diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 2fc186a257..0bc85cb8e0 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -7,9 +7,11 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Transpose Kernel Operations") { std::size_t num_dims = 2; - std::vector perm = {ff_dim_t(0), ff_dim_t(1)}; + std::vector perm = {ff_dim_t{0}, ff_dim_t{1}}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -18,41 +20,33 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Transpose::init_kernel(num_dims, perm); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10}); + make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Transpose::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR 
output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); + output_grad_accessor, + input_grad_accessor); - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index b591642570..bfed1241ba 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -1,105 +1,249 @@ #include "test_utils.h" +#include "op-attrs/tensor_shape.h" +#include -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements(); - std::vector host_data(volume); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dist(-1.0f, 1.0f); - - for (auto &val : host_data) { - val = dist(gen); +namespace FlexFlow { + +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW result_accessor = allocator.allocate_tensor(shape); + fill_with_zeros(result_accessor); + return result_accessor; +} + +TensorShape + make_tensor_shape_from_legion_dims(LegionOrdered const &dims, + DataType DT) { + return TensorShape{ + TensorDims{ + ff_ordered_from_legion_ordered(dims), + }, + DT, + }; +} + +template +struct CreateRandomFilledAccessorW { + GenericTensorAccessorW operator()(TensorShape const &shape, + Allocator &allocator) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); + + using T = real_type_t
<DT>; + T *data_ptr = src_accessor.get<DT>
(); + + std::random_device rd; + std::mt19937 gen(rd()); + size_t num_elements = get_num_elements(shape); + if constexpr (std::is_same::value) { + std::bernoulli_distribution dist(0.5); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } else if constexpr (std::is_floating_point::value) { + std::uniform_real_distribution dist(-1.0, 1.0); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } else if constexpr (std::is_integral::value) { + std::uniform_int_distribution dist(0, 100); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } + + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return dst_accessor; } +}; - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); +GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, + Allocator &allocator) { + return DataTypeDispatch1{}( + shape.data_type, shape, allocator); +} + +GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW accessor = + create_random_filled_accessor_w(shape, allocator); + + return read_only_accessor_from_write_accessor(accessor); +} + +template +struct FillWithZeros { + void operator()(GenericTensorAccessorW const &accessor) { + using T = real_type_t
; + + if (accessor.device_type == DeviceType::CPU) { + memset(accessor.ptr, 0, accessor.shape.get_volume() * sizeof(T)); + } else { + checkCUDA( + cudaMemset(accessor.ptr, 0, accessor.shape.get_volume() * sizeof(T))); + } } +}; - return accessor; +void fill_with_zeros(GenericTensorAccessorW const &accessor) { + DataTypeDispatch1{}(accessor.data_type, accessor); } -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - float val, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements(); - std::vector host_data(volume, val); - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); +template +struct CPUAccessorRContainsNonZero { + bool operator()(GenericTensorAccessorR const &accessor) { + using T = real_type_t
<DT>; + + T const *data_ptr = accessor.get<DT>
(); + + for (size_t i = 0; i < accessor.shape.num_elements(); i++) { + if (data_ptr[i] != 0) { + return true; + } + } + + return false; } +}; - return accessor; +bool contains_non_zero(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + copy_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); + return DataTypeDispatch1{}( + cpu_accessor.data_type, cpu_accessor); } -GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements(); - std::vector host_data(volume); +GenericTensorAccessorR + copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, + Allocator &cpu_allocator) { + GenericTensorAccessorR cpu_accessor = accessor; + if (accessor.device_type == DeviceType::GPU) { + cpu_accessor = copy_tensor_accessor_r(accessor, cpu_allocator); + } + return cpu_accessor; +} - for (size_t i = 0; i < volume; i++) { - host_data[i] = i; +GenericTensorAccessorW + copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor, + Allocator &cpu_allocator) { + GenericTensorAccessorW cpu_accessor = accessor; + if (accessor.device_type == DeviceType::GPU) { + cpu_accessor = copy_tensor_accessor_w(accessor, cpu_allocator); } + return cpu_accessor; +} + +template +struct Print2DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + using T = real_type_t
<DT>; + + T const *data_ptr = accessor.get<DT>
(); + int rows = accessor.shape.at(legion_dim_t{0}); + int cols = accessor.shape.at(legion_dim_t{1}); - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + stream << data_ptr[i * cols + j]; + + if (j < cols - 1) { + stream << " "; + } + } + stream << std::endl; + } } +}; - return accessor; +void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + copy_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); + DataTypeDispatch1<Print2DCPUAccessorR>{}( + cpu_accessor.data_type, cpu_accessor, stream); } -void fill_tensor_accessor_w(GenericTensorAccessorW accessor, - float val, - bool cpu_fill) { - LegionTensorDims dims = accessor.shape.dims; - size_t volume = accessor.shape.num_elements(); - std::vector host_data(volume, val); - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); +template +struct AccessorsAreEqual { + bool operator()(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor_a = + copy_accessor_r_to_cpu_if_necessary(accessor_a, cpu_allocator); + GenericTensorAccessorR cpu_accessor_b = + copy_accessor_r_to_cpu_if_necessary(accessor_b, cpu_allocator); + + using T = real_type_t
<DT>; + T const *a_data_ptr = cpu_accessor_a.get<DT>
(); + T const *b_data_ptr = cpu_accessor_b.get<DT>
(); + + for (size_t i = 0; i < accessor_a.shape.num_elements(); i++) { + if (a_data_ptr[i] != b_data_ptr[i]) { + return false; + } + } + + return true; + } +}; + +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + if (accessor_a.shape != accessor_b.shape) { + throw mk_runtime_error( + fmt::format("accessors_are_equal expected accessors to have the same " + "shape, but received: {} != {}", + accessor_a.shape, + accessor_b.shape)); } + return DataTypeDispatch1{}( + accessor_a.data_type, accessor_a, accessor_b); } -TensorShape make_float_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DataType::FLOAT, - }; +template +struct CreateFilledAccessorW { + GenericTensorAccessorW operator()(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + using T = real_type_t
<DT>; + if (!val.template has<T>()) { + throw mk_runtime_error("create_filled_accessor expected data type of " + "shape and passed-in value to match"); + } + + auto unwrapped_value = val.get<T>(); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); + + T *data_ptr = src_accessor.get<DT>
(); + for (size_t i = 0; i < dst_accessor.shape.num_elements(); i++) { + data_ptr[i] = unwrapped_value; + } + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + return dst_accessor; + } +}; + +GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + + return DataTypeDispatch1{}( + shape.data_type, shape, allocator, val); } -TensorShape make_double_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DataType::DOUBLE, - }; +GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + GenericTensorAccessorW w_accessor = + create_filled_accessor_w(shape, allocator, val); + return read_only_accessor_from_write_accessor(w_accessor); } +} // namespace FlexFlow diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index abce3fd444..d23b936cb0 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -1,48 +1,55 @@ #ifndef _FLEXFLOW_KERNELS_TEST_UTILS #define _FLEXFLOW_KERNELS_TEST_UTILS +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" #include "kernels/device.h" +#include "kernels/local_cpu_allocator.h" #include "kernels/local_cuda_allocator.h" #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" -#include +#include "op-attrs/datatype.h" +#include "op-attrs/datatype_value.dtg.h" + +namespace FlexFlow { GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill = false); + Allocator &allocator); -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - float val, - bool cpu_fill = false); +GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, + Allocator &allocator); + +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator); + +TensorShape + make_tensor_shape_from_legion_dims(LegionOrdered const &dims, + DataType DT); -GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill = false); +bool contains_non_zero(GenericTensorAccessorR const &accessor); -void fill_tensor_accessor_w(GenericTensorAccessorW accessor, - float val, - bool cpu_fill = false); +void fill_with_zeros(GenericTensorAccessorW const &accessor); -TensorShape make_float_tensor_shape_from_legion_dims(FFOrdered dims); +GenericTensorAccessorW + copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor, + Allocator &allocator); -TensorShape make_double_tensor_shape_from_legion_dims(FFOrdered dims); +GenericTensorAccessorR + copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, + Allocator &allocator); -template -std::vector load_data_to_host_from_device(GenericTensorAccessorR accessor) { - int volume = accessor.shape.get_volume(); +void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor); - std::vector local_data(volume); - checkCUDA(cudaMemcpy(local_data.data(), - accessor.ptr, - local_data.size() * sizeof(T), - cudaMemcpyDeviceToHost)); - return local_data; -} +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b); -template -bool contains_non_zero(std::vector &data) { - return !all_of(data, [](T const &val) { return val == 0; }); -} +GenericTensorAccessorW 
create_filled_accessor_w(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val); + +GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val); +} // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/local_cpu_allocator.h b/lib/local-execution/include/local-execution/local_cpu_allocator.h index d1e81facf2..cf6cfe35d1 100644 --- a/lib/local-execution/include/local-execution/local_cpu_allocator.h +++ b/lib/local-execution/include/local-execution/local_cpu_allocator.h @@ -12,6 +12,8 @@ struct LocalCPUAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_map> ptrs; }; diff --git a/lib/local-execution/include/local-execution/per_device_op_state.h b/lib/local-execution/include/local-execution/per_device_op_state.h index 1edd5b6360..f1f357a86e 100644 --- a/lib/local-execution/include/local-execution/per_device_op_state.h +++ b/lib/local-execution/include/local-execution/per_device_op_state.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H #define _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific_device_states.dtg.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 54c8dfc5f1..48584588e3 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific.h" #include "local-execution/itask_argument_accessor.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index 731e04fdc8..f697337c52 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -13,6 +13,9 @@ struct TrackedAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + + DeviceType get_allocation_device_type() const override; + size_t get_current_mem_usage(); private: diff --git a/lib/local-execution/src/local_cpu_allocator.cc b/lib/local-execution/src/local_cpu_allocator.cc index 4ca5f987a8..c4657e26b5 100644 --- a/lib/local-execution/src/local_cpu_allocator.cc +++ b/lib/local-execution/src/local_cpu_allocator.cc @@ -17,6 +17,10 @@ void LocalCPUAllocator::deallocate(void *ptr) { } } +DeviceType LocalCPUAllocator::get_allocation_device_type() const { + return DeviceType::CPU; +} + Allocator create_local_cpu_memory_allocator() { return Allocator::create(); } diff --git a/lib/local-execution/src/local_task_argument_accessor.cc b/lib/local-execution/src/local_task_argument_accessor.cc index 54eca7e514..5d099c6b46 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -24,8 +24,8 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( 
auto tensor_backing = std::get( this->tensor_slots_backing.at(slot_grad_pair)); if (priv == Permissions::RO) { - GenericTensorAccessorR readonly_tensor_backing = { - tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}; + GenericTensorAccessorR readonly_tensor_backing = + read_only_accessor_from_write_accessor(tensor_backing); return readonly_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { return tensor_backing; @@ -33,6 +33,7 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); } } + VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( slot_id_t slot, Permissions priv, IsGrad is_grad) const { SlotGradId slot_grad_pair = SlotGradId{slot, is_grad}; @@ -43,7 +44,7 @@ VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( for (GenericTensorAccessorW const &tensor_backing : variadic_tensor_backing) { readonly_variadic_tensor_backing.push_back( - {tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}); + read_only_accessor_from_write_accessor(tensor_backing)); } return readonly_variadic_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 851566fc02..3aed3111c7 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -133,9 +133,9 @@ static std::optional profiling, "[BatchNorm] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - output_grad.get_float_ptr(), output.get_float_ptr(), + output_grad.get_float_ptr(), + input.get_float_ptr(), input_grad.get_float_ptr(), scale.get_float_ptr(), scale_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 3e7baf49a9..e9adf88422 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -54,9 +54,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { profiling, "[Cast] forward_time = {:.2lf}ms\n", input, - output, - input.data_type, - attrs.dtype); + output); } static std::optional @@ -73,9 +71,7 @@ static std::optional profiling, "[Cast] forward_time = {:.2lf}ms\n", input_grad, - output_grad, - input.data_type, - attrs.dtype); + output_grad); } TaskImplFunction get_cast_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc index d5c6e7f851..d7c5c22170 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -108,8 +108,8 @@ static std::optional acc.get_argument(PER_DEVICE_STATE); auto attrs = acc.get_argument(ATTRS); - auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); auto filter = acc.get_tensor(FILTER); auto input_grad = acc.get_tensor_grad(INPUT); @@ -121,10 +121,10 @@ static std::optional profiling, "[Conv2d] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), filter.get_float_ptr(), filter_grad.get_float_ptr(), bias_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index 4ee609bd6c..10f1dce294 100644 --- a/lib/local-execution/src/ops/element_unary.cc 
+++ b/lib/local-execution/src/ops/element_unary.cc @@ -89,10 +89,10 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor_grad(INPUT); auto const &attrs = acc.get_argument(ATTRS); auto handle = acc.get_argument(HANDLE); @@ -107,10 +107,10 @@ static std::optional per_device_state, attrs, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } TaskImplFunction get_element_unary_init_task_impl() { diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 3fe5029fa1..8d998a8672 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -41,15 +41,15 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Flat] backward_time = {:.2lf}ms\n", input, - input_grad.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_flat_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 9934e2a45c..b567937c70 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -125,17 +125,17 @@ static std::optional auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - auto bias = acc.get_tensor(BIAS); + auto bias = acc.get_tensor(BIAS); auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); - float const *bias_ptr = NULL; + float *bias_ptr = NULL; if (attrs.use_bias) { bias_ptr = bias.get_float_ptr(); } @@ -148,13 +148,13 @@ static std::optional profiling, "[Linear] backward_time = {:.2lf}ms\n", per_device_state, - (void *)input.get_float_ptr(), - (void *)input_grad.get_float_ptr(), - (void *)output.get_float_ptr(), - (void *)output_grad.get_float_ptr(), - (void *)weight.get_float_ptr(), - (void *)weight_grad.get_float_ptr(), - (void *)bias_ptr, + output.get_float_ptr(), + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + bias_ptr, in_dim, out_dim, batch_size); diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index 33d62b713c..2e7fb8ce91 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -30,14 +30,14 @@ static DeviceSpecificDeviceStates auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - int input_w = input.shape.at(ff_dim_t(0)) + 1; - int input_h = input.shape.at(ff_dim_t(1)) + 1; - int input_c = input.shape.at(ff_dim_t(2)) + 1; - int input_n = input.shape.at(ff_dim_t(3)) + 1; - int output_w = 
output.shape.at(ff_dim_t(0)) + 1; - int output_h = output.shape.at(ff_dim_t(1)) + 1; - int output_c = output.shape.at(ff_dim_t(2)) + 1; - int output_n = output.shape.at(ff_dim_t(3)) + 1; + int input_w = input.shape.at(ff_dim_t{0}) + 1; + int input_h = input.shape.at(ff_dim_t{1}) + 1; + int input_c = input.shape.at(ff_dim_t{2}) + 1; + int input_n = input.shape.at(ff_dim_t{3}) + 1; + int output_w = output.shape.at(ff_dim_t{0}) + 1; + int output_h = output.shape.at(ff_dim_t{1}) + 1; + int output_c = output.shape.at(ff_dim_t{2}) + 1; + int output_n = output.shape.at(ff_dim_t{3}) + 1; printf("init pool (input): n(%d) c(%d) h(%d) " "w(%d)\n", @@ -125,19 +125,19 @@ static std::optional Pool2DPerDeviceState state = acc.get_argument(PER_DEVICE_STATE); - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor(INPUT); return profile(backward_kernel, profiling, "[Pool2D] backward_time = {:.2lf}ms\n", state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_pool_2d_init_task_impl() { diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index a58d79a4f8..1e85d7186e 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -64,13 +64,13 @@ static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Reduction] backward_time = {:.2lf}ms\n", - input_grad, - output_grad); + output_grad, + input_grad); } TaskImplFunction get_reduction_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/ops/repartition.cc index 73692f4a13..655e1f238b 100644 --- a/lib/local-execution/src/ops/repartition.cc +++ b/lib/local-execution/src/ops/repartition.cc @@ -86,8 +86,8 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(INPUT); + auto input_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index 135475a711..56bbfdd371 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -67,8 +67,8 @@ static std::optional return profile(backward_kernel, profiling, "[replicate] backward_time = {:.2lf}ms\n", - input_grad, output_grad, + input_grad, attrs.replicate_degree); } diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc index 7584d405eb..761718a9a7 100644 --- a/lib/local-execution/src/ops/reshape.cc +++ b/lib/local-execution/src/ops/reshape.cc @@ -87,8 +87,8 @@ static std::optional profiling, "[Reshape] backward time = {:.2lf}ms\n", per_device_state, - input_grad, - output_grad); + output_grad, + input_grad); } TaskImplFunction get_reshape_init_task_impl() { diff 
--git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc index 366a579bea..bb1b802edd 100644 --- a/lib/local-execution/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -53,11 +53,11 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; for (int i = 0; i < output.shape.get_dim(); i++) { if (i < axis.value) { - in_blk_size *= output.shape.at(ff_dim_t(i)); + in_blk_size *= output.shape.at(ff_dim_t{i}); } else if (i == axis.value) { - reverse_dim_size = output.shape.at(ff_dim_t(i)); + reverse_dim_size = output.shape.at(ff_dim_t{i}); } else { - num_out_blks *= output.shape.at(ff_dim_t(i)); + num_out_blks *= output.shape.at(ff_dim_t{i}); } } @@ -83,11 +83,11 @@ static std::optional coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; for (int i = 0; i < input_grad.shape.get_dim(); i++) { if (i < axis) { - in_blk_size *= input_grad.shape.at(ff_dim_t(i)); + in_blk_size *= input_grad.shape.at(ff_dim_t{i}); } else if (i == axis) { - reverse_dim_size = input_grad.shape.at(ff_dim_t(i)); + reverse_dim_size = input_grad.shape.at(ff_dim_t{i}); } else { - num_out_blks *= input_grad.shape.at(ff_dim_t(i)); + num_out_blks *= input_grad.shape.at(ff_dim_t{i}); } } diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc index 4c7979ae9b..9c5757112c 100644 --- a/lib/local-execution/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -102,8 +102,8 @@ static std::optional return profile(backward_kernel, profiling, "[SoftMax] backward_time = {:.2lf}ms\n", - input_grad.get_float_ptr(), output_grad.get_float_ptr(), + input_grad.get_float_ptr(), output_grad.shape.get_volume()); } diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc index 3e4ac15db3..0176e6d578 100644 --- a/lib/local-execution/src/ops/transpose.cc +++ b/lib/local-execution/src/ops/transpose.cc @@ -88,8 +88,8 @@ static std::optional profiling, "[Transpose] Backward_time = {:.2lf} [ms]", per_device_state, - input_grad, - output_grad); + output_grad, + input_grad); } OpTaskInvocation backward(TransposeAttrs const &attrs) { diff --git a/lib/local-execution/src/per_device_state.cc b/lib/local-execution/src/per_device_op_state.cc similarity index 100% rename from lib/local-execution/src/per_device_state.cc rename to lib/local-execution/src/per_device_op_state.cc diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index e6c3a11711..ed181aea32 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -23,8 +23,13 @@ size_t TrackedAllocator::get_current_mem_usage() { return this->current_mem_usage; } +DeviceType TrackedAllocator::get_allocation_device_type() const { + return this->allocator.get_allocation_device_type(); +} + Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { - return Allocator::create(base_allocator); + Allocator allocator = Allocator::create(base_allocator); + return allocator; } } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index da3af6e3ad..512c1ef33b 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,7 +12,11 @@ // TEST_SUITE(FF_CUDA_TEST_SUITE) { // TEST_CASE("Local Cost 
Estimator") { // // local backing initialization -// ManagedPerDeviceFFHandle managed_handle{}; +// ManagedPerDeviceFFHandle managed_handle{ +/*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true +} +; // RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ // DeviceSpecific::create(managed_handle.raw_handle()), diff --git a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml index 27aa50f38f..2c524c120a 100644 --- a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml +++ b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml @@ -10,5 +10,8 @@ features = [ [[values]] name = "SUM" -[[value]] +[[values]] name = "AVG" + +[[values]] +name = "NONE" diff --git a/lib/op-attrs/include/op-attrs/datatype_value.h b/lib/op-attrs/include/op-attrs/datatype_value.h new file mode 100644 index 0000000000..723e69bddd --- /dev/null +++ b/lib/op-attrs/include/op-attrs/datatype_value.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H + +#include "op-attrs/datatype_value.dtg.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value); +DataTypeValue make_double_data_type_value(double value); +DataTypeValue make_int32_data_type_value(int32_t value); +DataTypeValue make_int64_data_type_value(int64_t value); +DataTypeValue make_bool_data_type_value(bool value); + +} // namespace FlexFlow + +#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h index 6aa23d40fc..19a6e62178 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h +++ b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h @@ -175,8 +175,9 @@ auto inner_to_outer(FFOrdered const &ff_ordered) template std::vector inner_to_outer_idxs(FFOrdered const &ff_ordered) { std::vector idxs; - for (size_t i = 0; i < ff_ordered.size(); i++) { - idxs.push_back(ff_dim_t(ff_ordered.size() - i - 1)); + int size = static_cast(ff_ordered.size()); + for (int i = 0; i < ff_ordered.size(); i++) { + idxs.push_back(ff_dim_t{size - i - 1}); } return idxs; } diff --git a/lib/op-attrs/include/op-attrs/make_datatype_value.h b/lib/op-attrs/include/op-attrs/make_datatype_value.h new file mode 100644 index 0000000000..af4792dd9e --- /dev/null +++ b/lib/op-attrs/include/op-attrs/make_datatype_value.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H + +#include "op-attrs/datatype_value.dtg.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value); +DataTypeValue make_double_data_type_value(double value); +DataTypeValue make_int32_data_type_value(int32_t value); +DataTypeValue make_int64_data_type_value(int64_t value); +DataTypeValue make_bool_data_type_value(bool value); + +} // namespace FlexFlow + +#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/src/op-attrs/make_datatype_value.cc b/lib/op-attrs/src/op-attrs/make_datatype_value.cc new file mode 100644 index 0000000000..76d712949a --- /dev/null +++ b/lib/op-attrs/src/op-attrs/make_datatype_value.cc @@ -0,0 +1,25 @@ +#include "op-attrs/make_datatype_value.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value) { + return DataTypeValue{value}; +} + +DataTypeValue 
make_double_data_type_value(double value) { + return DataTypeValue{value}; +} + +DataTypeValue make_int32_data_type_value(int32_t value) { + return DataTypeValue{value}; +} + +DataTypeValue make_int64_data_type_value(int64_t value) { + return DataTypeValue{value}; +} + +DataTypeValue make_bool_data_type_value(bool value) { + return DataTypeValue{value}; +} + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ops/attention.cc b/lib/op-attrs/src/op-attrs/ops/attention.cc index 483d832fee..8a806bcf9f 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention.cc @@ -33,15 +33,15 @@ int get_oProjSize(MultiHeadAttentionAttrs const &attrs) { } int get_qSize(TensorShape const &query_shape) { - return dim_at_idx(query_shape, ff_dim_t(0)); + return dim_at_idx(query_shape, ff_dim_t{0}); } int get_kSize(TensorShape const &key_shape) { - return dim_at_idx(key_shape, ff_dim_t(0)); + return dim_at_idx(key_shape, ff_dim_t{0}); } int get_vSize(TensorShape const &value_shape) { - return dim_at_idx(value_shape, ff_dim_t(0)); + return dim_at_idx(value_shape, ff_dim_t{0}); } int get_qSize(MultiHeadAttentionParallelInputs const &inputs) { diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc index dcc567e0ca..6ea29b1855 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc @@ -138,7 +138,7 @@ std::unordered_set get_parallel_tensor_dim_indices(ParallelTensorShape const &shape) { std::unordered_set indices; extend(indices, transform(range(num_shard_dims(shape.dims)), [](int idx) { - return parallel_tensor_dim_idx_t(ff_dim_t(idx)); + return parallel_tensor_dim_idx_t(ff_dim_t{idx}); })); indices.insert(parallel_tensor_dim_idx_t(ReplicaType::SUM)); indices.insert(parallel_tensor_dim_idx_t(ReplicaType::DISCARD_COPY)); diff --git a/lib/pcg/include/pcg/metric.h b/lib/pcg/include/pcg/metric.h new file mode 100644 index 0000000000..718919112f --- /dev/null +++ b/lib/pcg/include/pcg/metric.h @@ -0,0 +1,72 @@ +#ifndef _FF_METRICS_H_ +#define _FF_METRICS_H_ + +#include "op-attrs/ops/loss_functions/loss_functions.h" +#include "utils/fmt.h" +#include &lt;vector&gt; + +namespace FlexFlow { + +enum class Metric { + ACCURACY, + CATEGORICAL_CROSSENTROPY, + SPARSE_CATEGORICAL_CROSSENTROPY, + MEAN_SQUARED_ERROR, + ROOT_MEAN_SQUARED_ERROR, + MEAN_ABSOLUTE_ERROR, +}; + +class MetricsAttrs { +public: + MetricsAttrs() = delete; + MetricsAttrs(LossFunction, std::vector&lt;Metric&gt; const &); + +public: + LossFunction loss_type; + bool measure_accuracy; + bool measure_categorical_crossentropy; + bool measure_sparse_categorical_crossentropy; + bool measure_mean_squared_error; + bool measure_root_mean_squared_error; + bool measure_mean_absolute_error; +}; + +} // namespace FlexFlow + +namespace fmt { + +template <> +struct formatter<::FlexFlow::Metric> : formatter { + template + auto format(::FlexFlow::Metric m, FormatContext &ctx) const + -> decltype(ctx.out()) { + using namespace FlexFlow; + + string_view name = "unknown"; + switch (m) { + case Metric::ACCURACY: + name = "Accuracy"; + break; + case Metric::CATEGORICAL_CROSSENTROPY: + name = "CategoricalCrossEntropy"; + break; + case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: + name = "SparseCategoricalCrossEntropy"; + break; + case Metric::MEAN_SQUARED_ERROR: + name = "MeanSquaredError"; + break; + case Metric::ROOT_MEAN_SQUARED_ERROR: + name = "RootMeanSquaredError"; + break; + case Metric::MEAN_ABSOLUTE_ERROR: + name = 
"MeanAbsoluteError"; + break; + } + return formatter::format(name, ctx); + } +}; + +} // namespace fmt + +#endif diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc index dff647f5a1..65ef214669 100644 --- a/lib/pcg/src/pcg/computation_graph_builder.cc +++ b/lib/pcg/src/pcg/computation_graph_builder.cc @@ -3,6 +3,7 @@ #include "op-attrs/get_incoming_tensor_roles.h" #include "op-attrs/get_op_type.h" #include "op-attrs/get_output_shapes.h" +#include "op-attrs/make_datatype_value.h" #include "op-attrs/ops/attention.h" #include "op-attrs/ops/batch_norm.h" #include "op-attrs/ops/broadcast.h" @@ -609,14 +610,14 @@ tensor_guid_t ComputationGraphBuilder::batch_norm( TensorShape gamma_shape = throw_if_unexpected(get_gamma_weights_shape(attrs, input_shape)); - InitializerAttrs gamma_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{1}}}}; + InitializerAttrs gamma_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(1)}}; weights.push_back(make_weight_attrs(gamma_shape, gamma_initializer)); TensorShape beta_shape = throw_if_unexpected(get_beta_weights_shape(attrs, input_shape)); - InitializerAttrs beta_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs beta_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back(make_weight_attrs(beta_shape, beta_initializer)); } @@ -688,8 +689,8 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention( get_input_bias_shape(attrs, query_shape, key_shape, value_shape)); // initializer chosen based on // https://github.com/pytorch/pytorch/blob/31c4e0d37d8efc37a0697159e5b9121ec34d5141/torch/nn/modules/activation.py#L1120-L1121 - InitializerAttrs input_bias_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs input_bias_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back( make_weight_attrs(input_bias_shape, input_bias_initializer)); @@ -698,8 +699,8 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention( get_output_bias_shape(attrs, query_shape, key_shape, value_shape)); // initializer chosen based on // https://github.com/pytorch/pytorch/blob/31c4e0d37d8efc37a0697159e5b9121ec34d5141/torch/nn/modules/activation.py#L1120-L1121 - InitializerAttrs output_bias_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs output_bias_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back( make_weight_attrs(output_bias_shape, output_bias_initializer)); @@ -870,14 +871,14 @@ tensor_guid_t ComputationGraphBuilder::layer_norm( TensorShape gamma_shape = throw_if_unexpected(get_gamma_weights_shape(attrs, input_shape)); - InitializerAttrs gamma_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{1}}}}; + InitializerAttrs gamma_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(1)}}; weights.push_back(make_weight_attrs(gamma_shape, gamma_initializer)); TensorShape beta_shape = throw_if_unexpected(get_beta_weights_shape(attrs, input_shape)); - InitializerAttrs beta_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs beta_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; 
weights.push_back(make_weight_attrs(beta_shape, beta_initializer)); } diff --git a/lib/pcg/src/pcg/metric.cc b/lib/pcg/src/pcg/metric.cc new file mode 100644 index 0000000000..69aba90d12 --- /dev/null +++ b/lib/pcg/src/pcg/metric.cc @@ -0,0 +1,38 @@ +#include "pcg/metric.h" + +namespace FlexFlow { +MetricsAttrs::MetricsAttrs(LossFunction _loss_type, + std::vector&lt;Metric&gt; const &metrics) + : loss_type(_loss_type), measure_accuracy(false), + measure_categorical_crossentropy(false), + measure_sparse_categorical_crossentropy(false), + measure_mean_squared_error(false), measure_root_mean_squared_error(false), + measure_mean_absolute_error(false) { + for (Metric const &m : metrics) { + switch (m) { + case Metric::ACCURACY: + measure_accuracy = true; + continue; + case Metric::CATEGORICAL_CROSSENTROPY: + measure_categorical_crossentropy = true; + continue; + case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: + measure_sparse_categorical_crossentropy = true; + continue; + case Metric::MEAN_SQUARED_ERROR: + measure_mean_squared_error = true; + continue; + case Metric::ROOT_MEAN_SQUARED_ERROR: + measure_root_mean_squared_error = true; + continue; + case Metric::MEAN_ABSOLUTE_ERROR: + measure_mean_absolute_error = true; + continue; + default: + throw mk_runtime_error( + "Initializing MetricsAttrs with unrecognized metrics type"); + } + } +} + +} // namespace FlexFlow diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index f33b4dcd17..79ac43ae66 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -1,5 +1,6 @@ #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" #include "op-attrs/get_incoming_tensor_roles.h" +#include "op-attrs/make_datatype_value.h" #include "op-attrs/ops/attention.h" #include "op-attrs/ops/batch_matmul.h" #include "op-attrs/ops/batch_norm.h" @@ -385,14 +386,14 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::batch_norm( ParallelTensorShape gamma_shape = throw_if_unexpected(get_gamma_weights_shape(attrs, input_shape)); - InitializerAttrs gamma_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{1}}}}; + InitializerAttrs gamma_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(1)}}; weights.push_back(make_weight_attrs(gamma_shape, gamma_initializer)); ParallelTensorShape beta_shape = throw_if_unexpected(get_beta_weights_shape(attrs, input_shape)); - InitializerAttrs beta_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs beta_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back(make_weight_attrs(beta_shape, beta_initializer)); } diff --git a/lib/runtime/src/metrics_functions.cc b/lib/runtime/src/metrics_functions.cc index feb6e704b2..33e15baed2 100644 --- a/lib/runtime/src/metrics_functions.cc +++ b/lib/runtime/src/metrics_functions.cc @@ -25,39 +25,6 @@ namespace FlexFlow { LegionRuntime::Logger::Category log_metrics("metrics"); -MetricsAttrs::MetricsAttrs(LossFunction _loss_type, - std::vector const &metrics) - : loss_type(_loss_type), measure_accuracy(false), - measure_categorical_crossentropy(false), - measure_sparse_categorical_crossentropy(false), - measure_mean_squared_error(false), measure_root_mean_squared_error(false), - 
measure_mean_absolute_error(false) { - for (Metric const &m : metrics) { - switch (m) { - case Metric::ACCURACY: - measure_accuracy = true; - continue; - case Metric::CATEGORICAL_CROSSENTROPY: - measure_categorical_crossentropy = true; - continue; - case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: - measure_sparse_categorical_crossentropy = true; - continue; - case Metric::MEAN_SQUARED_ERROR: - measure_mean_squared_error = true; - continue; - case Metric::ROOT_MEAN_SQUARED_ERROR: - measure_root_mean_squared_error = true; - continue; - case Metric::MEAN_ABSOLUTE_ERROR: - measure_mean_absolute_error = true; - continue; - default: - throw mk_runtime_error("Unrecogonized metrics type {}", m); - } - } -} - enum Slots { LOGIT, LABEL, diff --git a/lib/runtime/src/metrics_functions.h b/lib/runtime/src/metrics_functions.h index fbb0b633bf..73dc3bbc51 100644 --- a/lib/runtime/src/metrics_functions.h +++ b/lib/runtime/src/metrics_functions.h @@ -16,38 +16,13 @@ #ifndef _FF_METRICS_FUNCTIONS_H_ #define _FF_METRICS_FUNCTIONS_H_ +#include "kernels/metric.h" #include "kernels/perf_metrics.h" #include "legion.h" -#include "op-attrs/ops/loss_functions.h" #include "task_spec/task_invocation.h" -#include "utils/fmt.h" namespace FlexFlow { -enum class Metric { - ACCURACY, - CATEGORICAL_CROSSENTROPY, - SPARSE_CATEGORICAL_CROSSENTROPY, - MEAN_SQUARED_ERROR, - ROOT_MEAN_SQUARED_ERROR, - MEAN_ABSOLUTE_ERROR, -}; - -class MetricsAttrs { -public: - MetricsAttrs() = delete; - MetricsAttrs(LossFunction, std::vector const &); - -public: - LossFunction loss_type; - bool measure_accuracy; - bool measure_categorical_crossentropy; - bool measure_sparse_categorical_crossentropy; - bool measure_mean_squared_error; - bool measure_root_mean_squared_error; - bool measure_mean_absolute_error; -}; - TypedIndexTaskInvocation compute_metrics(MetricsAttrs const &, parallel_tensor_guid_t const &logit, @@ -79,40 +54,4 @@ VISITABLE_STRUCT(::FlexFlow::MetricsAttrs, measure_root_mean_squared_error, measure_mean_absolute_error); -namespace fmt { - -template <> -struct formatter<::FlexFlow::Metric> : formatter { - template - auto format(::FlexFlow::Metric m, FormatContext &ctx) const - -> decltype(ctx.out()) { - using namespace FlexFlow; - - string_view name = "unknown"; - switch (m) { - case Metric::ACCURACY: - name = "Accuracy"; - break; - case Metric::CATEGORICAL_CROSSENTROPY: - name = "CategoricalCrossEntropy"; - break; - case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: - name = "SparseCategoricalCrossEntropy"; - break; - case Metric::MEAN_SQUARED_ERROR: - name = "MeanSquaredError"; - break; - case Metric::ROOT_MEAN_SQUARED_ERROR: - name = "RootMeanSquaredError"; - break; - case Metric::MEAN_ABSOLUTE_ERROR: - name = "MeanAbsoluteError"; - break; - } - return formatter::format(name, ctx); - } -}; - -} // namespace fmt - #endif diff --git a/lib/runtime/src/ops/embedding.cc b/lib/runtime/src/ops/embedding.cc index 2370739d58..f34751ef8d 100644 --- a/lib/runtime/src/ops/embedding.cc +++ b/lib/runtime/src/ops/embedding.cc @@ -77,15 +77,15 @@ static std::optional return profile(backward_kernel, profiling, "[Embedding] backward_time = {:.2lf}ms\n", - input, output, + input, weight_grad, - input.data_type, output.data_type, + input.data_type, attrs.aggr, input.shape.get_dim(), output.shape.get_dim(), - input.shape.at(ff_dim_t(0))); + input.shape.at(ff_dim_t{0})); } TaskImplFunction get_embedding_fwd_task_impl() {