flexflow · oOTigger · Jul 11, 2024 · Jul 12, 2024 · Jul 14, 2024 · Jul 14, 2024
diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt
@@ -7,8 +7,7 @@ file(GLOB_RECURSE SRC
      CONFIGURE_DEPENDS
      LIST_DIRECTORIES False
      src/*.cc
-     src/cuda/cuda_helper.cu
-     src/cuda/ops/*.cu
+     src/cuda/*.cu
      )
 
 add_library(
@@ -30,6 +29,7 @@ target_link_libraries(
   cudnn
   nccl
   utils
+  pcg
 )
 
 define_ff_vars(${project_target})

diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h
@@ -5,11 +5,95 @@
 #include "device.h"
 #include "kernels/ff_handle.h"
 #include "op-attrs/datatype.h"
+#include "pcg/device_type.dtg.h"
 #include "utils/exception.h"
 #include "utils/required.h"
 
 namespace FlexFlow {
 
+// Read-only tensor accessor: bundles data type, shape, const pointer, device.
+class GenericTensorAccessorR {
+public:
+  // Returns the buffer as a typed read-only pointer; throws the error built
+  // by mk_runtime_error when DT differs from the stored data_type.
+  template <DataType DT>
+  typename data_type_enum_to_class<DT>::type const *get() const {
+    if (this->data_type != DT) {
+      throw mk_runtime_error(fmt::format(
+          "Invalid access data type ({} != {})", this->data_type, DT));
+    }
+    return static_cast<real_type_t<DT> const *>(this->ptr);
+  }
+
+  int32_t const *get_int32_ptr() const;
+  int64_t const *get_int64_ptr() const;
+  float const *get_float_ptr() const;
+  double const *get_double_ptr() const;
+  half const *get_half_ptr() const;
+
+  GenericTensorAccessorR() = delete;
+  GenericTensorAccessorR(DataType data_type,
+                         ArrayShape const &shape,
+                         void const *ptr,
+                         DeviceType device_type);
+
+  bool operator==(GenericTensorAccessorR const &) const;
+  bool operator!=(GenericTensorAccessorR const &) const;
+
+  // Bounds-checked element read, only permitted when device_type is CPU.
+  // indices holds one entry per dimension; entry i is scaled by the product
+  // of the extents of dimensions 0..i-1, so dimension 0 has stride 1.
+  // Throws the error built by mk_runtime_error on a non-CPU tensor, a data
+  // type mismatch, a wrong index count, or a negative/too-large index.
+  template <DataType DT>
+  real_type_t<DT> const &at(std::vector<int> const &indices) const {
+    if (this->device_type != DeviceType::CPU) {
+      throw mk_runtime_error("Calling at() on non-CPU allocated tensor");
+    }
+    if (this->data_type != DT) {
+      throw mk_runtime_error(fmt::format(
+          "Invalid access data type ({} != {})", this->data_type, DT));
+    }
+    if (indices.size() != this->shape.num_dims()) {
+      throw mk_runtime_error(fmt::format("Number of indices ({}) does not "
+                                         "match the number of dimensions ({}).",
+                                         indices.size(),
+                                         this->shape.num_dims()));
+    }
+    // 64-bit accumulators: a flat offset can exceed the range of a 32-bit int.
+    int64_t offset = 0;
+    int64_t multiplier = 1;
+    for (int i = 0; i < this->shape.num_dims(); i++) {
+      auto const extent = this->shape.at(legion_dim_t{i});
+      if (indices.at(i) < 0 || indices.at(i) >= extent) {
+        throw mk_runtime_error(
+            fmt::format("In {} dimension, attempting to access index {} "
+                        "when only {} indexes exist",
+                        i, indices.at(i), extent));
+      }
+      offset += indices.at(i) * multiplier;
+      multiplier *= extent;
+    }
+    return static_cast<real_type_t<DT> const *>(this->ptr)[offset];
+  }
+
+public:
+  DataType data_type;
+  ArrayShape shape;
+  void const *ptr;
+  DeviceType device_type;
+
+private:
+  std::tuple<decltype(data_type) const &,
+             decltype(shape) const &,
+             decltype(ptr) const &,
+             decltype(device_type) const &>
+      tie() const;
+};
+
+std::string format_as(GenericTensorAccessorR const &);
+std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &);
+
 class GenericTensorAccessorW {
 public:
   template <DataType DT>
@@ -28,64 +112,110 @@ class GenericTensorAccessorW {
   double *get_double_ptr() const;
   half *get_half_ptr() const;
 
-public:
-  DataType data_type;
-  ArrayShape shape;
-  req<void *> ptr;
-};
-FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW,
-                                             data_type,
-                                             shape,
-                                             ptr);
+  GenericTensorAccessorW() = delete;
 
-std::string format_as(GenericTensorAccessorW const &);
-std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &);
+  GenericTensorAccessorW(DataType data_type,
+                         ArrayShape const &shape,
+                         void *ptr,
+                         DeviceType device_type);
+
+  bool operator==(GenericTensorAccessorW const &) const;
+  bool operator!=(GenericTensorAccessorW const &) const;
+
+  operator GenericTensorAccessorR() const;
 
-class GenericTensorAccessorR {
-public:
   template <DataType DT>
-  typename data_type_enum_to_class<DT>::type const *get() const {
-    if (this->data_type == DT) {
-      return static_cast<real_type_t<DT> const *>(this->ptr);
-    } else {
+  real_type_t<DT> &at(std::vector<int> const &indices) {
+    if (this->device_type != DeviceType::CPU) {
+      throw mk_runtime_error("Calling at() on non-CPU allocated tensor");
+    }
+    if (this->data_type != DT) {
       throw mk_runtime_error(fmt::format(
           "Invalid access data type ({} != {})", this->data_type, DT));
     }
+    if (indices.size() != this->shape.num_dims()) {
+      throw mk_runtime_error(fmt::format("Number of indices ({}) does not "
+                                         "match the number of dimensions ({}).",
+                                         indices.size(),
+                                         this->shape.num_dims()));
+    }
+
+    using T = real_type_t<DT>;
+
+    T *data_ptr = static_cast<T *>(this->ptr);
+    int offset = 0;
+    int multiplier = 1;
+    for (int i = 0; i < this->shape.num_dims(); i++) {
+      if (indices.at(i) >= this->shape.at(legion_dim_t{i})) {
+        throw mk_runtime_error(
+            fmt::format("In {} dimension, attempting to access index {} "
+                        "when only {} indexes exist",
+                        i,
+                        indices.at(i),
+                        this->shape.at(legion_dim_t{i})));
+      }
+
+      offset += indices.at(i) * multiplier;
+      multiplier *= this->shape.at(legion_dim_t{i});
+    }
+
+    return data_ptr[offset];
   }
 
-  int32_t const *get_int32_ptr() const;
-  int64_t const *get_int64_ptr() const;
-  float const *get_float_ptr() const;
-  double const *get_double_ptr() const;
-  half const *get_half_ptr() const;
+  // Const-qualified element access for CPU tensors: the accessor itself is
+  // not modified, but the returned reference is writable (like the non-const
+  // pointers returned by the get_*_ptr() const members). Throws the error
+  // built by mk_runtime_error on a non-CPU tensor, a data type mismatch, a
+  // wrong index count, or a negative/too-large index.
+  template <DataType DT>
+  real_type_t<DT> &at(std::vector<int> const &indices) const {
+    if (this->device_type != DeviceType::CPU) {
+      throw mk_runtime_error("Calling at() on non-CPU allocated tensor");
+    }
+    if (this->data_type != DT) {
+      throw mk_runtime_error(fmt::format(
+          "Invalid access data type ({} != {})", this->data_type, DT));
+    }
+    if (indices.size() != this->shape.num_dims()) {
+      throw mk_runtime_error(fmt::format("Number of indices ({}) does not "
+                                         "match the number of dimensions ({}).",
+                                         indices.size(),
+                                         this->shape.num_dims()));
+    }
+    // 64-bit accumulators: a flat offset can exceed the range of a 32-bit int.
+    int64_t offset = 0;
+    int64_t multiplier = 1;
+    for (int i = 0; i < this->shape.num_dims(); i++) {
+      auto const extent = this->shape.at(legion_dim_t{i});
+      if (indices.at(i) < 0 || indices.at(i) >= extent) {
+        throw mk_runtime_error(
+            fmt::format("In {} dimension, attempting to access index {} "
+                        "when only {} indexes exist",
+                        i, indices.at(i), extent));
+      }
+      offset += indices.at(i) * multiplier;
+      multiplier *= extent;
+    }
+    // Cast to T* (not T const*): a const element cannot bind to the T& result.
+    return static_cast<real_type_t<DT> *>(this->ptr)[offset];
+  }
 
 public:
   DataType data_type;
   ArrayShape shape;
-  req<void const *> ptr;
-};
-FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR,
-                                             data_type,
-                                             shape,
-                                             ptr);
+  void *ptr;
+  DeviceType device_type;
 
-std::string format_as(GenericTensorAccessorR const &);
-std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &);
+private:
+  std::tuple<decltype(data_type) const &,
+             decltype(shape) const &,
+             decltype(ptr) const &,
+             decltype(device_type) const &>
+      tie() const;
+};
 
-int32_t *get_int32_ptr(GenericTensorAccessorW const &);
-int64_t *get_int64_ptr(GenericTensorAccessorW const &);
-float *get_float_ptr(GenericTensorAccessorW const &);
-double *get_double_ptr(GenericTensorAccessorW const &);
-half *get_half_ptr(GenericTensorAccessorW const &);
-std::vector<int32_t *>
-    get_int32_ptrs(std::vector<GenericTensorAccessorW> const &);
-std::vector<int64_t *>
-    get_int64_ptrs(std::vector<GenericTensorAccessorW> const &);
-std::vector<float *>
-    get_float_ptrs(std::vector<GenericTensorAccessorW> const &);
-std::vector<double *>
-    get_double_ptrs(std::vector<GenericTensorAccessorW> const &);
-std::vector<half *> get_half_ptrs(std::vector<GenericTensorAccessorW> const &);
+std::string format_as(GenericTensorAccessorW const &);
+std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &);
 
 static_assert(is_fmtable<req<DataType> const &>::value, "");
 
@@ -137,6 +267,21 @@ std::vector<double const *>
 std::vector<half const *>
     get_half_ptrs(std::vector<GenericTensorAccessorR> const &);
 
+int32_t *get_int32_ptr(GenericTensorAccessorW const &);
+int64_t *get_int64_ptr(GenericTensorAccessorW const &);
+float *get_float_ptr(GenericTensorAccessorW const &);
+double *get_double_ptr(GenericTensorAccessorW const &);
+half *get_half_ptr(GenericTensorAccessorW const &);
+std::vector<int32_t *>
+    get_int32_ptrs(std::vector<GenericTensorAccessorW> const &);
+std::vector<int64_t *>
+    get_int64_ptrs(std::vector<GenericTensorAccessorW> const &);
+std::vector<float *>
+    get_float_ptrs(std::vector<GenericTensorAccessorW> const &);
+std::vector<double *>
+    get_double_ptrs(std::vector<GenericTensorAccessorW> const &);
+std::vector<half *> get_half_ptrs(std::vector<GenericTensorAccessorW> const &);
+
 template <DataType DT>
 std::vector<real_type_t<DT> const *>
     get(std::vector<GenericTensorAccessorR> const &accs) {
@@ -150,21 +295,18 @@ std::vector<real_type_t<DT> const *>
 GenericTensorAccessorR read_only_accessor_from_write_accessor(
     GenericTensorAccessorW const &write_accessor);
 
-bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1,
-                              GenericTensorAccessorW const &acc2);
-
-bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor,
-                             ArrayShape const &expected_shape,
-                             DataType const &expected_dtype);
+bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1,
+                              GenericTensorAccessorR const &acc2);
 
 bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor,
                              ArrayShape const &expected_shape,
                              DataType const &expected_dtype);
 
 std::pair<ArrayShape, DataType>
     get_shape_and_datatype(GenericTensorAccessorR const &accessor);
-std::pair<ArrayShape, DataType>
-    get_shape_and_datatype(GenericTensorAccessorW const &accessor);
+
+void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor,
+                                    GenericTensorAccessorR const &src_accessor);
 
 } // namespace FlexFlow
 

diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h
@@ -1,7 +1,7 @@
 #ifndef _FLEXFLOW_KERNELS_ALLOCATION_H
 #define _FLEXFLOW_KERNELS_ALLOCATION_H
 
-#include "accessor.h"
+#include "kernels/accessor.h"
 #include <cstddef>
 #include <memory>
 
@@ -11,16 +11,21 @@ struct IAllocator {
   virtual void *allocate(size_t) = 0;
   virtual void deallocate(void *) = 0;
 
+  virtual DeviceType get_allocation_device_type() const = 0;
+
   virtual ~IAllocator() = default;
 };
 
 struct Allocator {
   Allocator() = delete;
 
   GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape);
+
   void *allocate(size_t mem_size);
   void deallocate(void *ptr);
 
+  DeviceType get_allocation_device_type() const;
+
   template <typename T, typename... Args>
   static typename std::enable_if<std::is_base_of<IAllocator, T>::value,
                                  Allocator>::type

diff --git a/lib/kernels/include/kernels/attention_kernels.h b/lib/kernels/include/kernels/attention_kernels.h
@@ -64,8 +64,7 @@ FF_VISITABLE_STRUCT_NO_EQ(MHAPerDeviceState,
 std::string format_as(MHAPerDeviceState const &x);
 std::ostream &operator<<(std::ostream &s, MHAPerDeviceState const &x);
 
-namespace Kernels {
-namespace MultiHeadAttention {
+namespace Kernels::MultiHeadAttention {
 
 MHAPerDeviceState init_kernel(PerDeviceFFHandle const &,
                               Allocator &,
@@ -105,8 +104,7 @@ void backward_kernel(ffStream_t stream,
 void cleanup_kernel(Allocator &allocator,
                     MHAPerDeviceState const &device_state);
 
-} // namespace MultiHeadAttention
-} // namespace Kernels
+} // namespace Kernels::MultiHeadAttention
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/kernels/include/kernels/batch_matmul_kernels.h b/lib/kernels/include/kernels/batch_matmul_kernels.h
@@ -5,9 +5,7 @@
 #include "kernels/allocation.h"
 #include "kernels/ff_handle.h"
 
-namespace FlexFlow {
-namespace Kernels {
-namespace BatchMatmul {
+namespace FlexFlow::Kernels::BatchMatmul {
 
 void forward_kernel(ffStream_t stream,
                     PerDeviceFFHandle const &handle,
@@ -35,8 +33,6 @@ void backward_kernel(ffStream_t stream,
                      int k,
                      int batch);
 
-} // namespace BatchMatmul
-} // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow::Kernels::BatchMatmul
 
 #endif