diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index 8ccd7c1011..f5d88f102f 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -7,8 +7,7 @@ file(GLOB_RECURSE SRC CONFIGURE_DEPENDS LIST_DIRECTORIES False src/*.cc - src/cuda/cuda_helper.cu - src/cuda/ops/*.cu + src/cuda/*.cu ) add_library( @@ -30,6 +29,7 @@ target_link_libraries( cudnn nccl utils + pcg ) define_ff_vars(${project_target}) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 39da65c3be..487bc1f8f0 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -5,11 +5,95 @@ #include "device.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" +#include "pcg/device_type.dtg.h" #include "utils/exception.h" #include "utils/required.h" namespace FlexFlow { +class GenericTensorAccessorR { +public: + template + typename data_type_enum_to_class
::type const *get() const { + if (this->data_type == DT) { + return static_cast const *>(this->ptr); + } else { + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); + } + } + + int32_t const *get_int32_ptr() const; + int64_t const *get_int64_ptr() const; + float const *get_float_ptr() const; + double const *get_double_ptr() const; + half const *get_half_ptr() const; + + GenericTensorAccessorR() = delete; + + GenericTensorAccessorR(DataType data_type, + ArrayShape const &shape, + void const *ptr, + DeviceType device_type); + + bool operator==(GenericTensorAccessorR const &) const; + bool operator!=(GenericTensorAccessorR const &) const; + + template + real_type_t
const &at(std::vector const &indices) const { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); + } + if (indices.size() != this->shape.num_dims()) { + throw mk_runtime_error(fmt::format("Number of indices ({}) does not " + "match the number of dimensions ({}).", + indices.size(), + this->shape.num_dims())); + } + + using T = real_type_t
; + T const *data_ptr = static_cast(this->ptr); + + int offset = 0; + int multiplier = 1; + for (int i = 0; i < this->shape.num_dims(); i++) { + if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + indices.at(i), + this->shape.at(legion_dim_t{i}))); + } + + offset += indices.at(i) * multiplier; + multiplier *= this->shape.at(legion_dim_t{i}); + } + + return data_ptr[offset]; + } + +public: + DataType data_type; + ArrayShape shape; + void const *ptr; + DeviceType device_type; + +private: + std::tuple + tie() const; +}; + +std::string format_as(GenericTensorAccessorR const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); + class GenericTensorAccessorW { public: template @@ -28,64 +112,110 @@ class GenericTensorAccessorW { double *get_double_ptr() const; half *get_half_ptr() const; -public: - DataType data_type; - ArrayShape shape; - req ptr; -}; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW, - data_type, - shape, - ptr); + GenericTensorAccessorW() = delete; -std::string format_as(GenericTensorAccessorW const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); + GenericTensorAccessorW(DataType data_type, + ArrayShape const &shape, + void *ptr, + DeviceType device_type); + + bool operator==(GenericTensorAccessorW const &) const; + bool operator!=(GenericTensorAccessorW const &) const; + + operator GenericTensorAccessorR() const; -class GenericTensorAccessorR { -public: template - typename data_type_enum_to_class
::type const *get() const { - if (this->data_type == DT) { - return static_cast const *>(this->ptr); - } else { + real_type_t
&at(std::vector const &indices) { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { throw mk_runtime_error(fmt::format( "Invalid access data type ({} != {})", this->data_type, DT)); } + if (indices.size() != this->shape.num_dims()) { + throw mk_runtime_error(fmt::format("Number of indices ({}) does not " + "match the number of dimensions ({}).", + indices.size(), + this->shape.num_dims())); + } + + using T = real_type_t
; + + T *data_ptr = static_cast(this->ptr); + int offset = 0; + int multiplier = 1; + for (int i = 0; i < this->shape.num_dims(); i++) { + if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + indices.at(i), + this->shape.at(legion_dim_t{i}))); + } + + offset += indices.at(i) * multiplier; + multiplier *= this->shape.at(legion_dim_t{i}); + } + + return data_ptr[offset]; } - int32_t const *get_int32_ptr() const; - int64_t const *get_int64_ptr() const; - float const *get_float_ptr() const; - double const *get_double_ptr() const; - half const *get_half_ptr() const; + template + real_type_t
&at(std::vector const &indices) const { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); + } + if (indices.size() != this->shape.num_dims()) { + throw mk_runtime_error(fmt::format("Number of indices ({}) does not " + "match the number of dimensions ({}).", + indices.size(), + this->shape.num_dims())); + } + + using T = real_type_t
; + + T const *data_ptr = static_cast(this->ptr); + int offset = 0; + int multiplier = 1; + for (int i = 0; i < this->shape.num_dims(); i++) { + if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + indices.at(i), + this->shape.at(legion_dim_t{i}))); + } + + offset += indices.at(i) * multiplier; + multiplier *= this->shape.at(legion_dim_t{i}); + } + + return data_ptr[offset]; + } public: DataType data_type; ArrayShape shape; - req ptr; -}; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR, - data_type, - shape, - ptr); + void *ptr; + DeviceType device_type; -std::string format_as(GenericTensorAccessorR const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); +private: + std::tuple + tie() const; +}; -int32_t *get_int32_ptr(GenericTensorAccessorW const &); -int64_t *get_int64_ptr(GenericTensorAccessorW const &); -float *get_float_ptr(GenericTensorAccessorW const &); -double *get_double_ptr(GenericTensorAccessorW const &); -half *get_half_ptr(GenericTensorAccessorW const &); -std::vector - get_int32_ptrs(std::vector const &); -std::vector - get_int64_ptrs(std::vector const &); -std::vector - get_float_ptrs(std::vector const &); -std::vector - get_double_ptrs(std::vector const &); -std::vector get_half_ptrs(std::vector const &); +std::string format_as(GenericTensorAccessorW const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); static_assert(is_fmtable const &>::value, ""); @@ -137,6 +267,21 @@ std::vector std::vector get_half_ptrs(std::vector const &); +int32_t *get_int32_ptr(GenericTensorAccessorW const &); +int64_t *get_int64_ptr(GenericTensorAccessorW const &); +float *get_float_ptr(GenericTensorAccessorW const &); +double *get_double_ptr(GenericTensorAccessorW const &); +half *get_half_ptr(GenericTensorAccessorW const &); +std::vector + get_int32_ptrs(std::vector const &); +std::vector + get_int64_ptrs(std::vector const &); +std::vector + get_float_ptrs(std::vector const &); +std::vector + get_double_ptrs(std::vector const &); +std::vector get_half_ptrs(std::vector const &); + template std::vector const *> get(std::vector const &accs) { @@ -150,12 +295,8 @@ std::vector const *> GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &write_accessor); -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2); - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype); +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2); bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, @@ -163,8 +304,9 @@ bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, std::pair get_shape_and_datatype(GenericTensorAccessorR const &accessor); -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor); + +void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 6500899394..4bf97118ce 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -1,7 +1,7 @@ #ifndef 
_FLEXFLOW_KERNELS_ALLOCATION_H #define _FLEXFLOW_KERNELS_ALLOCATION_H -#include "accessor.h" +#include "kernels/accessor.h" #include #include @@ -11,6 +11,8 @@ struct IAllocator { virtual void *allocate(size_t) = 0; virtual void deallocate(void *) = 0; + virtual DeviceType get_allocation_device_type() const = 0; + virtual ~IAllocator() = default; }; @@ -18,9 +20,12 @@ struct Allocator { Allocator() = delete; GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape); + void *allocate(size_t mem_size); void deallocate(void *ptr); + DeviceType get_allocation_device_type() const; + template static typename std::enable_if::value, Allocator>::type diff --git a/lib/kernels/include/kernels/attention_kernels.h b/lib/kernels/include/kernels/attention_kernels.h index eb5a1b8198..1e483102dd 100644 --- a/lib/kernels/include/kernels/attention_kernels.h +++ b/lib/kernels/include/kernels/attention_kernels.h @@ -64,8 +64,7 @@ FF_VISITABLE_STRUCT_NO_EQ(MHAPerDeviceState, std::string format_as(MHAPerDeviceState const &x); std::ostream &operator<<(std::ostream &s, MHAPerDeviceState const &x); -namespace Kernels { -namespace MultiHeadAttention { +namespace Kernels::MultiHeadAttention { MHAPerDeviceState init_kernel(PerDeviceFFHandle const &, Allocator &, @@ -105,8 +104,7 @@ void backward_kernel(ffStream_t stream, void cleanup_kernel(Allocator &allocator, MHAPerDeviceState const &device_state); -} // namespace MultiHeadAttention -} // namespace Kernels +} // namespace Kernels::MultiHeadAttention } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/batch_matmul_kernels.h b/lib/kernels/include/kernels/batch_matmul_kernels.h index bfd72647b0..bde91bea15 100644 --- a/lib/kernels/include/kernels/batch_matmul_kernels.h +++ b/lib/kernels/include/kernels/batch_matmul_kernels.h @@ -5,9 +5,7 @@ #include "kernels/allocation.h" #include "kernels/ff_handle.h" -namespace FlexFlow { -namespace Kernels { -namespace BatchMatmul { +namespace FlexFlow::Kernels::BatchMatmul { void forward_kernel(ffStream_t stream, PerDeviceFFHandle const &handle, @@ -35,8 +33,6 @@ void backward_kernel(ffStream_t stream, int k, int batch); -} // namespace BatchMatmul -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::BatchMatmul #endif diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index 7d533d672c..3fea92c86b 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -43,8 +43,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(BatchNormPerDeviceState, output_w, relu); -namespace Kernels { -namespace BatchNorm { +namespace Kernels::BatchNorm { BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, Allocator allocator, @@ -64,9 +63,9 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, BatchNormPerDeviceState const &m, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, @@ -81,8 +80,7 @@ void cleanup_kernel(Allocator allocator, bool relu, float *runningMean); -} // namespace BatchNorm -} // namespace Kernels +} // namespace Kernels::BatchNorm } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index 96f9aadd52..da13e0036d 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ 
b/lib/kernels/include/kernels/cast_kernels.h @@ -3,27 +3,17 @@ #include "device.h" #include "kernels/accessor.h" -#include "kernels/ff_handle.h" -#include "op-attrs/activation.dtg.h" -namespace FlexFlow { -namespace Kernels { -namespace Cast { +namespace FlexFlow::Kernels::Cast { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Cast -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Cast #endif diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h new file mode 100644 index 0000000000..a5df80d4da --- /dev/null +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H + +#include "device.h" +#include "kernels/accessor.h" + +namespace FlexFlow::Kernels::Cast { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); + +} // namespace FlexFlow::Kernels::Cast + +#endif diff --git a/lib/kernels/include/kernels/combine_kernels.h b/lib/kernels/include/kernels/combine_kernels.h index eb263e0734..50de18e823 100644 --- a/lib/kernels/include/kernels/combine_kernels.h +++ b/lib/kernels/include/kernels/combine_kernels.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Combine { +namespace FlexFlow::Kernels::Combine { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -16,8 +14,6 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad); -} // namespace Combine -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Combine #endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h new file mode 100644 index 0000000000..430c7cf906 --- /dev/null +++ b/lib/kernels/include/kernels/combine_kernels_cpu.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H + +#include "device.h" +#include "kernels/accessor.h" + +namespace FlexFlow::Kernels::Combine { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); + +} // namespace FlexFlow::Kernels::Combine + +#endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/concat_kernels.h b/lib/kernels/include/kernels/concat_kernels.h index a44affc1f2..33355296dd 100644 --- a/lib/kernels/include/kernels/concat_kernels.h +++ b/lib/kernels/include/kernels/concat_kernels.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Concat { +namespace FlexFlow::Kernels::Concat { void 
forward_kernel(ffStream_t stream, GenericTensorAccessorW const &output, @@ -18,8 +16,6 @@ void backward_kernel(ffStream_t stream, std::vector const &input_grads, ff_dim_t axis); -} // namespace Concat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Concat #endif diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index cfc64f963d..f49c8f50f4 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -34,8 +34,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Conv2DPerDeviceState, bwdFilterAlgo, bwdDataAlgo); -namespace Kernels { -namespace Conv2D { +namespace Kernels::Conv2D { Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, std::optional activation, @@ -61,17 +60,16 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, std::optional activation); -} // namespace Conv2D -} // namespace Kernels +} // namespace Kernels::Conv2D } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/copy_tensor_accessor.h b/lib/kernels/include/kernels/copy_tensor_accessor.h new file mode 100644 index 0000000000..da8af71e4f --- /dev/null +++ b/lib/kernels/include/kernels/copy_tensor_accessor.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H +#define _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" + +namespace FlexFlow { + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/datatype_dispatch.h b/lib/kernels/include/kernels/datatype_dispatch.h index e83fc3325d..50ca66a820 100644 --- a/lib/kernels/include/kernels/datatype_dispatch.h +++ b/lib/kernels/include/kernels/datatype_dispatch.h @@ -1,7 +1,8 @@ #ifndef _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H #define _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H -#include "accessor.h" +#include "op-attrs/datatype.h" +#include "utils/exception.h" namespace FlexFlow { @@ -33,7 +34,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(Args... args) const { + Out operator()(Args &&...args) const { return F
{}(std::forward(args)...); } }; @@ -41,7 +42,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(DataType data_type, Args... args) { + Out operator()(DataType data_type, Args &&...args) { return dispatch(data_type, std::forward(args)...); } }; @@ -54,13 +55,13 @@ struct DataTypeDispatch2 { template struct OutputType { template - void operator()(Args... args) const { + void operator()(Args &&...args) const { F{}(std::forward(args)...); } }; template - void operator()(DataType output_type, Args... args) const { + void operator()(DataType output_type, Args &&...args) const { dispatch(output_type, std::forward(args)...); } }; @@ -68,7 +69,7 @@ struct DataTypeDispatch2 { template void operator()(DataType input_data_type, DataType output_data_type, - Args... args) { + Args &&...args) { dispatch( input_data_type, output_data_type, std::forward(args)...); } diff --git a/lib/kernels/include/kernels/dropout_kernels.h b/lib/kernels/include/kernels/dropout_kernels.h index c0e503be5b..4790540098 100644 --- a/lib/kernels/include/kernels/dropout_kernels.h +++ b/lib/kernels/include/kernels/dropout_kernels.h @@ -31,8 +31,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(DropoutPerDeviceState, reserveSpaceSize, dropoutStateSize); -namespace Kernels { -namespace Dropout { +namespace Kernels::Dropout { DropoutPerDeviceState init_kernel(PerDeviceFFHandle handle, float rate, @@ -56,8 +55,7 @@ void cleanup_kernel(Allocator allocator, ffDropoutDescriptor_t dropoutDesc, void *dropoutStates); -} // namespace Dropout -} // namespace Kernels +} // namespace Kernels::Dropout } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H diff --git a/lib/kernels/include/kernels/element_binary_kernels.h b/lib/kernels/include/kernels/element_binary_kernels.h index 41447e98e6..1017230fb0 100644 --- a/lib/kernels/include/kernels/element_binary_kernels.h +++ b/lib/kernels/include/kernels/element_binary_kernels.h @@ -26,8 +26,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(ElementBinaryPerDeviceState, opDesc, reduceAddDesc); -namespace Kernels { -namespace ElementBinary { +namespace Kernels::ElementBinary { ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle, OperatorType op_type, @@ -58,8 +57,7 @@ void backward_kernel(ffStream_t stream, bool broadcast_inputRHS, PerDeviceFFHandle handle); -} // namespace ElementBinary -} // namespace Kernels +} // namespace Kernels::ElementBinary } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h index 8c6864b2d9..c338f465ac 100644 --- a/lib/kernels/include/kernels/element_unary_kernels.h +++ b/lib/kernels/include/kernels/element_unary_kernels.h @@ -19,8 +19,7 @@ FF_VISITABLE_STRUCT_NO_EQ(ElementUnaryPerDeviceState, outputTensor, actiDesc); -namespace Kernels { -namespace ElementUnary { +namespace Kernels::ElementUnary { ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, ArrayShape const &output_shape, @@ -37,13 +36,12 @@ void backward_kernel(ffStream_t stream, ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad); -} // namespace ElementUnary -} // namespace Kernels +} 
// namespace Kernels::ElementUnary } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/embedding_kernels.h b/lib/kernels/include/kernels/embedding_kernels.h index 06582ca1d5..f5b2561b56 100644 --- a/lib/kernels/include/kernels/embedding_kernels.h +++ b/lib/kernels/include/kernels/embedding_kernels.h @@ -5,9 +5,7 @@ #include "kernels/accessor.h" #include "op-attrs/ops/embedding.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, @@ -19,11 +17,11 @@ void forward_kernel(ffStream_t stream, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, @@ -35,8 +33,6 @@ void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p); template __global__ void rand_generate_int(TD *ptr, size_t size, TD p); -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding #endif // _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H diff --git a/lib/kernels/include/kernels/flat_kernels.h b/lib/kernels/include/kernels/flat_kernels.h index 3e600c48de..d60a1a5157 100644 --- a/lib/kernels/include/kernels/flat_kernels.h +++ b/lib/kernels/include/kernels/flat_kernels.h @@ -4,20 +4,17 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Flat { +namespace FlexFlow::Kernels::Flat { void forward_kernel(ffStream_t stream, GenericTensorAccessorR input, float *output_ptr); -void backward_kernel(ffStream_t stream, + +void backward_kernel(cudaStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr); + float const *output_grad_ptr, + float *input_grad_ptr); -} // namespace Flat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Flat #endif // _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H diff --git a/lib/kernels/include/kernels/gather_kernels.h b/lib/kernels/include/kernels/gather_kernels.h index 13bf4b898a..af2da3b11f 100644 --- a/lib/kernels/include/kernels/gather_kernels.h +++ b/lib/kernels/include/kernels/gather_kernels.h @@ -15,8 +15,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState, handle, legion_dim); -namespace Kernels { -namespace Gather { +namespace Kernels::Gather { void forward_kernel(ffStream_t stream, GatherPerDeviceState const &m, @@ -30,8 +29,7 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &index, GenericTensorAccessorW const &input_grad); -} // namespace Gather -} // namespace Kernels +} // namespace Kernels::Gather } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/layer_norm_kernels.h b/lib/kernels/include/kernels/layer_norm_kernels.h index be13d32879..a6ae87442a 100644 --- a/lib/kernels/include/kernels/layer_norm_kernels.h +++ b/lib/kernels/include/kernels/layer_norm_kernels.h @@ -30,8 +30,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LayerNormPerDeviceState, bias, data_type); -namespace Kernels { -namespace LayerNorm { +namespace Kernels::LayerNorm { // todo: this may have some problem. 
LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle, @@ -57,8 +56,7 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorW const &gamma_grad, GenericTensorAccessorW const &beta_grad); -} // namespace LayerNorm -} // namespace Kernels +} // namespace Kernels::LayerNorm } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h index 3128e39fd0..cd581b0a25 100644 --- a/lib/kernels/include/kernels/linear_kernels.h +++ b/lib/kernels/include/kernels/linear_kernels.h @@ -33,8 +33,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LinearPerDeviceState, weight_type, output_type); -namespace Kernels { -namespace Linear { +namespace Kernels::Linear { LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, float *one_ptr, @@ -51,29 +50,28 @@ bool use_activation(Activation activation); void forward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *filter_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_ptr, int in_dim, int out_dim, int batch_size); -} // namespace Linear -} // namespace Kernels +} // namespace Kernels::Linear } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h new file mode 100644 index 0000000000..cf6cfe35d1 --- /dev/null +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -0,0 +1,24 @@ +#include "kernels/allocation.h" +#include + +namespace FlexFlow { + +struct LocalCPUAllocator : public IAllocator { + LocalCPUAllocator() = default; + LocalCPUAllocator(LocalCPUAllocator const &) = delete; + LocalCPUAllocator(LocalCPUAllocator &&) = delete; + ~LocalCPUAllocator() = default; + + void *allocate(size_t) override; + void deallocate(void *) override; + + DeviceType get_allocation_device_type() const override; + +private: + std::unordered_map> ptrs; +}; +CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCPUAllocator); + +Allocator create_local_cpu_memory_allocator(); + +} // namespace FlexFlow diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h index 18a4b6e78a..b8e0540974 100644 --- a/lib/kernels/include/kernels/local_cuda_allocator.h +++ b/lib/kernels/include/kernels/local_cuda_allocator.h @@ -12,6 +12,8 @@ struct LocalCudaAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_set ptrs; }; diff --git a/lib/kernels/include/kernels/loss_function_kernels.h b/lib/kernels/include/kernels/loss_function_kernels.h index bab404f884..9e0dbd4ba1 100644 --- a/lib/kernels/include/kernels/loss_function_kernels.h +++ b/lib/kernels/include/kernels/loss_function_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H #define 
_FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H -#include "kernels/device.h" +#include "device.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/managed_ff_stream.h b/lib/kernels/include/kernels/managed_ff_stream.h index 2f690b2eb3..26d5fb4911 100644 --- a/lib/kernels/include/kernels/managed_ff_stream.h +++ b/lib/kernels/include/kernels/managed_ff_stream.h @@ -19,6 +19,8 @@ struct ManagedFFStream { ffStream_t const &raw_stream() const; + void cleanup(); + private: ffStream_t *stream; }; diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index 0a83a5eecb..035ea574de 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -7,7 +7,10 @@ namespace FlexFlow { struct ManagedPerDeviceFFHandle { public: - ManagedPerDeviceFFHandle(); + ManagedPerDeviceFFHandle() = delete; + + ManagedPerDeviceFFHandle(size_t workSpaceSize, + bool allowTensorOpMathConversion); ManagedPerDeviceFFHandle(ManagedPerDeviceFFHandle const &) = delete; ManagedPerDeviceFFHandle & @@ -21,6 +24,8 @@ struct ManagedPerDeviceFFHandle { PerDeviceFFHandle const &raw_handle() const; + void cleanup(); + private: PerDeviceFFHandle *handle; }; diff --git a/lib/kernels/include/kernels/metrics_kernels.h b/lib/kernels/include/kernels/metrics_kernels.h index e4660808b9..d961ee7503 100644 --- a/lib/kernels/include/kernels/metrics_kernels.h +++ b/lib/kernels/include/kernels/metrics_kernels.h @@ -1,25 +1,24 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H -#include "perf_metrics.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric.h" namespace FlexFlow { -void update_metrics_sparse_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - int const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); -void update_metrics_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - float const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const *me, + int num_effective_samples, + int num_classes, + PerfMetrics &perf_zc); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const *me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc); } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/nccl.h b/lib/kernels/include/kernels/nccl.h index b8a6784676..042911d172 100644 --- a/lib/kernels/include/kernels/nccl.h +++ b/lib/kernels/include/kernels/nccl.h @@ -23,15 +23,11 @@ struct ncclUniqueId {}; struct ncclComm_t {}; #endif -namespace FlexFlow { -namespace Kernels { -namespace NCCL { +namespace FlexFlow::Kernels::NCCL { ncclUniqueId generate_unique_id(); ncclComm_t create_comm(ncclUniqueId const &, int num_ranks, int my_rank); -} // namespace NCCL -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::NCCL #endif diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index 9ca6bf8e2b..3b5d292a5f 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -2,53 +2,91 @@ #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H #include "device.h" +#include 
"kernels/ff_handle.h" +#include "kernels/nccl.h" +#include "kernels/per_device_op_state.dtg.h" namespace FlexFlow { -void sgd_ps_update_task_gpu(ffStream_t, - float lr, - float momentum, - bool nesterov, +__global__ void sgd_update(size_t count, + float lr, + float weight_decay, + float momentum, + bool nesterov, + float const *WGrad, + float *V, + float *W); + +class SGDOptimizer { +public: + static __host__ void ps_update_task_gpu(SGDOptimizer const *op, + float const *w_grad_ptr, + size_t size, + int num_replicas, + float *w_ptr, + float *v_ptr); + +#ifdef FF_USE_NCCL + static __host__ void nccl_update_task_gpu(SGDOptimizer const *op, + PerDeviceOpState const *meta, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr); +#endif + +public: + float lr; + float weight_decay; + float momentum; + bool nesterov; +}; + +__global__ void + add_kernel(int count, float scale, float const *src, float *dst); + +__global__ void scale_kernel(int count, float a, float b, float *ptr); + +__global__ void adam_update(int count, + float alpha_t, + float beta1, + float beta2, float weight_decay, - float const *weight_grad_ptr, - size_t size, - int num_replicas, - float *weight_ptr, - float *sgd_v_ptr); - -void sgd_nccl_update_task_gpu(ffStream_t, - float lr, - float momentum, - bool nesterov, - float weight_decay PerDeviceFFHandle const &, - float const *weight_grad_ptr, - size_t size, - float *weight_ptr, - float *sgd_v_ptr); - -void adam_ps_update_task_gpu(ffStream_t, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - float const *weight_grad_ptr, - float *adam_m_ptr, - float *adam_v_ptr, - float *weight_ptr); - -void adam_nccl_update_task_gpu(ffStream_t, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - PerDeviceFFHandle const &, - float const *weight_grad_ptr, - float *adam_m_ptr, - float *adam_v_ptr, - float *weight_ptr); + float epsilon, + float const *WGrad, + float *M, + float *V, + float *W); -} // namespace FlexFlow +class AdamOptimizer { +public: + static __host__ void ps_update_task_gpu(AdamOptimizer const *op, + float const *w_grad_ptr, + size_t size, + int num_replicas, + float *w_ptr, + float *v_ptr, + float *m_ptr); +#ifdef FF_USE_NCCL + static __host__ void nccl_update_task_gpu(AdamOptimizer const *op, + PerDeviceOpState const *meta, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr, + float *m_ptr); #endif + +public: + float alpha; + float alpha_t; + float beta1; + float beta2; + float weight_decay; + float epsilon; +}; + +} // namespace FlexFlow + +#endif // _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H diff --git a/lib/kernels/include/kernels/partition_kernels.h b/lib/kernels/include/kernels/partition_kernels.h index 64ef1a1352..9a303952d0 100644 --- a/lib/kernels/include/kernels/partition_kernels.h +++ b/lib/kernels/include/kernels/partition_kernels.h @@ -13,8 +13,7 @@ struct RepartitionPerDeviceState { FF_VISITABLE_STRUCT_NO_EQ(RepartitionPerDeviceState, handle, data_type); -namespace Kernels { -namespace Repartition { +namespace Kernels::Repartition { RepartitionPerDeviceState init_kernel(PerDeviceFFHandle const &handle, DataType data_type); @@ -26,11 +25,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &output_grad, - GenericTensorAccessorR const &input_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); -} // 
namespace Repartition -} // namespace Kernels +} // namespace Kernels::Repartition } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H diff --git a/lib/local-execution/include/local-execution/per_device_op_state.variant.toml b/lib/kernels/include/kernels/per_device_op_state.variant.toml similarity index 100% rename from lib/local-execution/include/local-execution/per_device_op_state.variant.toml rename to lib/kernels/include/kernels/per_device_op_state.variant.toml diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index 798c0507f8..ad0a52efb9 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Pool2DPerDeviceState, poolDesc, relu); -namespace Kernels { -namespace Pool2D { +namespace Kernels::Pool2D { Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle, std::optional activation, @@ -68,15 +67,14 @@ void forward_kernel(ffStream_t stream, void const *input_ptr, void *output_ptr); -void backward_kernel(ffStream_t stream, +void backward_kernel(cudaStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr); + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr); -} // namespace Pool2D -} // namespace Kernels +} // namespace Kernels::Pool2D } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h index 4287472875..cd3930ea1c 100644 --- a/lib/kernels/include/kernels/reduce_kernels.h +++ b/lib/kernels/include/kernels/reduce_kernels.h @@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT(ReducePerDeviceState, op_type, reduction_size); -namespace Kernels { -namespace Reduce { +namespace Kernels::Reduce { ReducePerDeviceState init_kernel(PerDeviceFFHandle const &, OperatorType const &, @@ -43,8 +42,7 @@ void backward_kernel(ffStream_t stream, ReducePerDeviceState const &m, float const *output_grad_ptr, float *input_grad_ptr); -} // namespace Reduce -} // namespace Kernels +} // namespace Kernels::Reduce } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H diff --git a/lib/kernels/include/kernels/reduction_kernels.h b/lib/kernels/include/kernels/reduction_kernels.h index fb3baf215c..12553edd5e 100644 --- a/lib/kernels/include/kernels/reduction_kernels.h +++ b/lib/kernels/include/kernels/reduction_kernels.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Reduction { +namespace FlexFlow::Kernels::Reduction { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -14,11 +12,9 @@ void forward_kernel(ffStream_t stream, size_t num_replicas); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Reduction -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reduction #endif // _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels.h b/lib/kernels/include/kernels/replicate_kernels.h index 409fc81f44..7ed55cd1a1 100644 --- a/lib/kernels/include/kernels/replicate_kernels.h +++ b/lib/kernels/include/kernels/replicate_kernels.h @@ -4,21 
+4,17 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Replicate { +namespace FlexFlow::Kernels::Replicate { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas); -} // namespace Replicate -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Replicate #endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h new file mode 100644 index 0000000000..1c7aa4ee4a --- /dev/null +++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H + +#include "device.h" +#include "kernels/accessor.h" + +namespace FlexFlow::Kernels::Replicate { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + size_t num_replicas); + +} // namespace FlexFlow::Kernels::Replicate + +#endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index a83caa6bea..6e19a9d251 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -13,8 +13,7 @@ struct ReshapePerDeviceState { FF_VISITABLE_STRUCT(ReshapePerDeviceState, data_type); -namespace Kernels { -namespace Reshape { +namespace Kernels::Reshape { ReshapePerDeviceState init_kernel(DataType data_type); @@ -25,11 +24,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, ReshapePerDeviceState const &per_device_state, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Reshape -} // namespace Kernels +} // namespace Kernels::Reshape } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H diff --git a/lib/kernels/include/kernels/reverse_kernels.h b/lib/kernels/include/kernels/reverse_kernels.h index 42a83ae219..deb5b22155 100644 --- a/lib/kernels/include/kernels/reverse_kernels.h +++ b/lib/kernels/include/kernels/reverse_kernels.h @@ -3,9 +3,7 @@ #include "device.h" -namespace FlexFlow { -namespace Kernels { -namespace Reverse { +namespace FlexFlow::Kernels::Reverse { void forward_kernel(ffStream_t stream, float const *in_ptr, @@ -23,8 +21,6 @@ void backward_kernel(ffStream_t stream, coord_t in_blk_size, coord_t input_size); -} // namespace Reverse -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reverse #endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h new file mode 100644 index 0000000000..35af06aafb --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Reverse { + +void 
cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, + GenericTensorAccessorW &input_accessor); +} // namespace FlexFlow::Kernels::Reverse + +#endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/softmax_kernels.h b/lib/kernels/include/kernels/softmax_kernels.h index 061230ec52..520ea61b64 100644 --- a/lib/kernels/include/kernels/softmax_kernels.h +++ b/lib/kernels/include/kernels/softmax_kernels.h @@ -15,8 +15,7 @@ struct SoftmaxPerDeviceState { FF_VISITABLE_STRUCT(SoftmaxPerDeviceState, handle, inputTensor, dim); -namespace Kernels { -namespace Softmax { +namespace Kernels::Softmax { SoftmaxPerDeviceState init_kernel(PerDeviceFFHandle const &handle, int dim, @@ -31,12 +30,11 @@ void forward_kernel(ffStream_t stream, float *output_ptr); void backward_kernel(ffStream_t stream, - float *input_grad_ptr, float const *output_grad_ptr, + float *input_grad_ptr, size_t num_elements); -} // namespace Softmax -} // namespace Kernels +} // namespace Kernels::Softmax } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/split_kernels.h b/lib/kernels/include/kernels/split_kernels.h index 36434d4be8..538b9602c2 100644 --- a/lib/kernels/include/kernels/split_kernels.h +++ b/lib/kernels/include/kernels/split_kernels.h @@ -3,10 +3,7 @@ #include "device.h" -namespace FlexFlow { - -namespace Kernels { -namespace Split { +namespace FlexFlow::Kernels::Split { void forward_kernel(ffStream_t stream, float **out_ptrs, float const *in_ptr, @@ -22,8 +19,6 @@ void backward_kernel(ffStream_t stream, coord_t num_blks, int numOutputs); -} // namespace Split -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Split #endif // _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H diff --git a/lib/kernels/include/kernels/topk_kernels.h b/lib/kernels/include/kernels/topk_kernels.h index ae1c739f6c..6f33381e1a 100644 --- a/lib/kernels/include/kernels/topk_kernels.h +++ b/lib/kernels/include/kernels/topk_kernels.h @@ -12,8 +12,7 @@ struct TopKPerDeviceState { FF_VISITABLE_STRUCT(TopKPerDeviceState, sorted); -namespace Kernels { -namespace TopK { +namespace Kernels::TopK { TopKPerDeviceState init_kernel(bool sorted); @@ -35,8 +34,7 @@ void backward_kernel(ffStream_t stream, int length, int k); -} // namespace TopK -} // namespace Kernels +} // namespace Kernels::TopK } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H diff --git a/lib/kernels/include/kernels/transpose_kernels.h b/lib/kernels/include/kernels/transpose_kernels.h index 56da81ba2b..dbf78826cb 100644 --- a/lib/kernels/include/kernels/transpose_kernels.h +++ b/lib/kernels/include/kernels/transpose_kernels.h @@ -16,8 +16,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(TransposePerDeviceState, num_dim, perm); -namespace Kernels { -namespace Transpose { +namespace Kernels::Transpose { TransposePerDeviceState init_kernel(int num_dim, std::vector const &perm); @@ -29,11 +28,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, TransposePerDeviceState const &m, - GenericTensorAccessorW const &in_grad, - GenericTensorAccessorR const &out_grad); + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad); -} // namespace Transpose -} // namespace Kernels +} // namespace Kernels::Transpose } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H diff --git 
a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index 27b7eb390d..e56bded737 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -1,7 +1,68 @@ #include "kernels/accessor.h" +#include "kernels/allocation.h" +#include "kernels/datatype_dispatch.h" namespace FlexFlow { +void copy_accessor_data_to_l_from_r( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor) { + size_t num_bytes = dst_accessor.shape.get_volume() * + size_of_datatype(dst_accessor.data_type); + + DeviceType dst_device_type = dst_accessor.device_type; + DeviceType src_device_type = src_accessor.device_type; + + if (src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::CPU) { + memcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes); + } else if (src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::GPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyHostToDevice)); + } else if (src_device_type == DeviceType::GPU && + dst_device_type == DeviceType::CPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); + } else { + assert(src_device_type == DeviceType::GPU); + assert(dst_device_type == DeviceType::GPU); + checkCUDA(cudaMemcpy(dst_accessor.ptr, + src_accessor.ptr, + num_bytes, + cudaMemcpyDeviceToDevice)); + } +} + +GenericTensorAccessorW::operator GenericTensorAccessorR() const { + return read_only_accessor_from_write_accessor(*this); +} + +GenericTensorAccessorW::GenericTensorAccessorW( + DataType data_type, + ArrayShape const &shape, + void *ptr, + DeviceType device_type = DeviceType::GPU) + : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {} + +std::tuple + GenericTensorAccessorW::tie() const { + return std::tie(this->data_type, this->shape, this->ptr, this->device_type); +} + +bool GenericTensorAccessorW::operator==( + GenericTensorAccessorW const &other) const { + return this->tie() == other.tie(); +} + +bool GenericTensorAccessorW::operator!=( + GenericTensorAccessorW const &other) const { + return this->tie() != other.tie(); +} + int32_t *GenericTensorAccessorW::get_int32_ptr() const { return this->get(); } @@ -33,6 +94,31 @@ std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) { return (s << fmt::to_string(a)); } +GenericTensorAccessorR::GenericTensorAccessorR( + DataType data_type, + ArrayShape const &shape, + void const *ptr, + DeviceType device_type = DeviceType::GPU) + : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {} + +std::tuple + GenericTensorAccessorR::tie() const { + return std::tie(this->data_type, this->shape, this->ptr, this->device_type); +} + +bool GenericTensorAccessorR::operator==( + GenericTensorAccessorR const &other) const { + return this->tie() == other.tie(); +} + +bool GenericTensorAccessorR::operator!=( + GenericTensorAccessorR const &other) const { + return this->tie() != other.tie(); +} + int32_t const *GenericTensorAccessorR::get_int32_ptr() const { return this->get(); } @@ -64,51 +150,6 @@ std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { return (s << fmt::to_string(a)); } -int32_t *get_int32_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -int64_t *get_int64_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -float *get_float_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -double *get_double_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -half 
*get_half_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) { return get(a); } @@ -156,22 +197,17 @@ std::vector GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &writable) { - return GenericTensorAccessorR{ - writable.data_type, writable.shape, req(writable.ptr)}; + return GenericTensorAccessorR{writable.data_type, + writable.shape, + req(writable.ptr), + writable.device_type}; } -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2) { +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2) { return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; } -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, DataType const &expected_dtype) { @@ -184,9 +220,4 @@ std::pair return std::make_pair(accessor.shape, accessor.data_type); } -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - } // namespace FlexFlow diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index ccd88580db..733146851a 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -11,10 +11,17 @@ void Allocator::deallocate(void *ptr) { this->i_allocator->deallocate(ptr); } +DeviceType Allocator::get_allocation_device_type() const { + return this->i_allocator->get_allocation_device_type(); +} + GenericTensorAccessorW Allocator::allocate_tensor(TensorShape const &tensor_shape) { void *ptr = this->allocate(get_size_in_bytes(tensor_shape)); - return {tensor_shape.data_type, tensor_shape, ptr}; + return {tensor_shape.data_type, + tensor_shape, + ptr, + this->get_allocation_device_type()}; } } // namespace FlexFlow diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index d5e2f1167d..5c18a9ab5a 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -53,6 +53,7 @@ std::size_t ArrayShape::at(ff_dim_t idx) const { ArrayShape ArrayShape::sub_shape( std::optional> start, std::optional> end) const { + NOT_IMPLEMENTED(); } diff --git a/lib/kernels/src/copy_tensor_accessor.cc b/lib/kernels/src/copy_tensor_accessor.cc new file mode 100644 index 0000000000..6a3ad8033a --- /dev/null +++ b/lib/kernels/src/copy_tensor_accessor.cc @@ -0,0 +1,48 @@ +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow { + +template +struct CopyTensorAccessorW { + GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + 
copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return dst_accessor; + } +}; + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +template +struct CopyTensorAccessorR { + GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return read_only_accessor_from_write_accessor(dst_accessor); + } +}; + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc new file mode 100644 index 0000000000..08f5552afc --- /dev/null +++ b/lib/kernels/src/cpu/cast_kernels.cc @@ -0,0 +1,51 @@ +#include "kernels/cast_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow::Kernels::Cast { + +template +void cpu_cast_forward(IDT const *input, ODT *output, size_t volume) { + for (size_t i = 0; i < volume; ++i) { + output[i] = static_cast(input[i]); + } +} + +template +void cpu_cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) { + for (size_t i = 0; i < volume; i++) { + output[i] = static_cast(input[i]) + beta * output[i]; + } +} + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + size_t volume = input.shape.get_volume(); + cpu_cast_forward(input.get(), output.get(), volume); + } +}; + +template +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume(); + cpu_cast_backward( + output.get(), input.get(), volume, cast_to(1.0f)); + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch2{}( + input.data_type, output.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch2{}( + output.data_type, input.data_type, output, input); +} + +} // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/cpu/combine_kernels.cc b/lib/kernels/src/cpu/combine_kernels.cc new file mode 100644 index 0000000000..d0be1f9f2d --- /dev/null +++ b/lib/kernels/src/cpu/combine_kernels.cc @@ -0,0 +1,38 @@ +#include "kernels/combine_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow::Kernels::Combine { + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + memcpy(output.get
<DT>(),
+           input.get<DT>(),
+           input.shape.get_volume() * size_of_datatype(DT));
+  }
+};
+
+template <DataType DT>
+struct CPUBackwardKernel {
+  void operator()(GenericTensorAccessorR const &output_grad,
+                  GenericTensorAccessorW const &input_grad) {
+    size_t num_elements = output_grad.shape.get_volume();
+    for (int i = 0; i < num_elements; ++i) {
+      input_grad.get<DT>()[i] += output_grad.get<DT>()[i];
+    }
+  }
+};
+
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW const &output) {
+  DataTypeDispatch1<CPUForwardKernel>{}(input.data_type, input, output);
+}
+
+void cpu_backward_kernel(GenericTensorAccessorR const &output_grad,
+                         GenericTensorAccessorW const &input_grad) {
+  DataTypeDispatch1<CPUBackwardKernel>{}(
+      input_grad.data_type, output_grad, input_grad);
+}
+
+} // namespace FlexFlow::Kernels::Combine
diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc
new file mode 100644
index 0000000000..cfcb44dac5
--- /dev/null
+++ b/lib/kernels/src/cpu/replicate_kernels.cc
@@ -0,0 +1,44 @@
+#include "kernels/datatype_dispatch.h"
+#include "kernels/replicate_kernels_cpu.h"
+
+namespace FlexFlow::Kernels::Replicate {
+
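+// CPU reference kernels for Replicate: forward is a plain element-wise copy,
+// while backward reduces the gradient over the replica dimension.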
+template <DataType DT>
+struct CPUForwardKernel {
+  void operator()(GenericTensorAccessorR const &input,
+                  GenericTensorAccessorW &output) {
+    memcpy(output.get<DT>(),
+           input.get<DT>(),
+           input.shape.num_elements() * size_of_datatype(DT));
+  }
+};
+
+template <DataType DT>
+struct CPUBackwardKernel {
+  void operator()(GenericTensorAccessorR const &output,
+                  GenericTensorAccessorW &input,
+                  size_t num_replicas) {
+    using T = real_type_t<DT>;
+    for (int i = 0; i < input.shape.num_elements(); i++) {
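+      // Sum the gradient for element i across all replica slots j.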
+      T cur_sum = 0;
+      for (int j = 0; j < num_replicas; j++) {
+        cur_sum += output.at<DT>({i, j});
+      }
+      input.at<DT>({i}) = cur_sum;
+    }
+  }
+};
+
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW &output) {
+  DataTypeDispatch1<CPUForwardKernel>{}(input.data_type, input, output);
+}
+
+void cpu_backward_kernel(GenericTensorAccessorR const &output,
+                         GenericTensorAccessorW &input,
+                         size_t num_replicas) {
+  DataTypeDispatch1<CPUBackwardKernel>{}(
+      input.data_type, output, input, num_replicas);
+}
+
+} // namespace FlexFlow::Kernels::Replicate
diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc
new file mode 100644
index 0000000000..bc73c80e9e
--- /dev/null
+++ b/lib/kernels/src/cpu/reverse_kernels.cc
@@ -0,0 +1,43 @@
+#include "kernels/datatype_dispatch.h"
+#include "kernels/reverse_kernels_cpu.h"
+#include <cassert>
+#include <vector>
+
+namespace FlexFlow::Kernels::Reverse {
+
+template <DataType DT>
+struct CPUReverseForwardKernel {
+  void operator()(GenericTensorAccessorR const &input,
+                  GenericTensorAccessorW &output) {
+    assert(input.data_type == DT && output.data_type == DT);
+
+    int num_out_blocks = input.shape.at(legion_dim_t(0));
+    int reverse_dim_size = input.shape.at(legion_dim_t(1));
+    int in_block_size = input.shape.at(legion_dim_t(2));
+
+    for (int block_idx = 0; block_idx < num_out_blocks; block_idx++) {
+      for (int rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) {
+        for (int i = 0; i < in_block_size; i++) {
+          output.at<DT>({block_idx, rev_idx, i}) =
+              input.at<DT>
({num_out_blocks - 1 - block_idx, + reverse_dim_size - 1 - rev_idx, + in_block_size - 1 - i}); + } + } + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor) { + DataTypeDispatch1{}( + input_accessor.data_type, input_accessor, output_accessor); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, + GenericTensorAccessorW &input_accessor) { + DataTypeDispatch1{}( + output_accessor.data_type, output_accessor, input_accessor); +} + +} // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu index 2ff02038f4..b30cf6a663 100644 --- a/lib/kernels/src/cuda/cuda_helper.cu +++ b/lib/kernels/src/cuda/cuda_helper.cu @@ -29,13 +29,13 @@ cudaError_t get_legion_stream(cudaStream_t *stream) { #error "Unknown device, please make sure if CUDA is enabled" #endif -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +__global__ void scale_kernel(float *ptr, size_t size, float a, float b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } } -__global__ void ones_kernel(float *ptr, coord_t size) { +__global__ void ones_kernel(float *ptr, size_t size) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = 1.0f; } @@ -49,7 +49,7 @@ __global__ void assign_kernel(DT *ptr, size_t size, DT value) { } template -__global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { +__global__ void copy_kernel(DT *dst, const DT *src, size_t size) { CUDA_KERNEL_LOOP(i, size) { dst[i] = src[i]; } @@ -281,11 +281,11 @@ template __global__ void add_kernel(bool *dst, bool const *src, unsigned long size); template __global__ void - copy_kernel(float *dst, float const *src, coord_t size); + copy_kernel(float *dst, float const *src, size_t size); template __global__ void - copy_kernel(int32_t *dst, int32_t const *src, coord_t size); + copy_kernel(int32_t *dst, int32_t const *src, size_t size); template __global__ void - copy_kernel(int64_t *dst, int64_t const *src, coord_t size); + copy_kernel(int64_t *dst, int64_t const *src, size_t size); template __global__ void apply_add_with_scale(float *data_ptr, float const *grad_ptr, diff --git a/lib/kernels/src/cuda/embedding_kernels.cu b/lib/kernels/src/cuda/embedding_kernels.cu index e6a614ba70..c83e9f0a94 100644 --- a/lib/kernels/src/cuda/embedding_kernels.cu +++ b/lib/kernels/src/cuda/embedding_kernels.cu @@ -17,12 +17,11 @@ #include "kernels/datatype_dispatch.h" #include "kernels/embedding_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the intput tensor to avoid out of index range issues rand_generate_int<<>>( @@ -31,36 +30,14 @@ void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the intput tensor to avoid out of index range issues rand_generate_int<<>>( ptr, size, p); } -template -__global__ void embed_forward_no_aggr( - TI const *input, TD *output, TD const *embed, int out_dim, int batch_size); -template -__global__ void embed_forward_with_aggr(TI const *input, - TD *output, - TD const *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); -template -__global__ void 
embed_backward_no_aggr( - TI const *input, TD const *output, TD *embed, int out_dim, int batch_size); -template -__global__ void embed_backward_with_aggr(TI const *input, - TD const *output, - TD *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); - -template +template __global__ void embed_forward_no_aggr(int32_t const *input, TD *output, TD const *embed, @@ -75,7 +52,7 @@ __global__ void embed_forward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_no_aggr(int64_t const *input, TD *output, TD const *embed, @@ -90,14 +67,14 @@ __global__ void embed_forward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int32_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -115,14 +92,14 @@ __global__ void embed_forward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int64_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -140,7 +117,7 @@ __global__ void embed_forward_with_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int32_t const *input, TD const *output, TD *embed, @@ -154,7 +131,7 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int64_t const *input, TD const *output, TD *embed, @@ -171,11 +148,11 @@ __global__ void embed_backward_no_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void embed_backward_no_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ -192,11 +169,11 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } template <> -__global__ void embed_backward_no_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ -212,14 +189,14 @@ __global__ void embed_backward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int32_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -238,14 +215,14 @@ __global__ void embed_backward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int64_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -267,14 +244,13 @@ __global__ void embed_backward_with_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void - 
embed_backward_with_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr) { +__global__ void embed_backward_with_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -301,14 +277,13 @@ __global__ void } template <> -__global__ void - embed_backward_with_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr) { +__global__ void embed_backward_with_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -351,35 +326,219 @@ struct ForwardKernel { int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(weight.data_type == DataType::HALF || - weight.data_type == DataType::FLOAT || - weight.data_type == DataType::DOUBLE); + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr, real_type_t> + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + weight.get(), out_dim, - batch_size); + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if 
(!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - embed_forward_with_aggr, real_type_t> + embed_forward_with_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + weight.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); } } }; @@ -388,39 +547,229 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, std::optional aggr, + GenericTensorAccessorR const &output, GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(output.data_type == DataType::HALF || - output.data_type == DataType::FLOAT || - output.data_type == DataType::DOUBLE); if (!aggr.has_value()) { - embed_backward_no_aggr, real_type_t> + embed_backward_no_aggr <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, batch_size); } else { - embed_backward_with_aggr, real_type_t> + embed_backward_with_aggr <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, 
+ aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); } } }; @@ -448,27 +797,25 @@ void forward_kernel(ffStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, int batch_size) { - DataTypeDispatch2{}(input_data_type, - output_data_type, + DataTypeDispatch2{}(output_data_type, + input_data_type, stream, aggr, - input, output, + input, weight_grad, in_dim, out_dim, batch_size); } -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding diff --git a/lib/kernels/src/cuda/metrics_functions.cu b/lib/kernels/src/cuda/metrics_functions.cu index 2e037eb472..0250f829ec 100644 --- a/lib/kernels/src/cuda/metrics_functions.cu +++ b/lib/kernels/src/cuda/metrics_functions.cu @@ -13,17 +13,42 @@ * limitations under the License. 
*/ -#include "flexflow/model.h" -#include "flexflow/utils/cuda_helper.h" +#include "device.h" +#include "kernels/metrics_kernels.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric.h" namespace FlexFlow { +struct CUDAPerfMetrics { + int train_all; + int train_correct; + float cce_loss; + float sparse_cce_loss; + float mse_loss; + float rmse_loss; + float mae_loss; + double start_time; + double current_time; + + CUDAPerfMetrics() = delete; + CUDAPerfMetrics(PerfMetrics const &perf) + : train_all(perf.train_all), + train_correct(perf.train_correct.value_or(-1)), + cce_loss(perf.cce_loss.value_or(-1)), + sparse_cce_loss(perf.sparse_cce_loss.value_or(-1)), + mse_loss(perf.mse_loss.value_or(-1)), + rmse_loss(perf.rmse_loss.value_or(-1)), + mae_loss(perf.mae_loss.value_or(-1)), start_time(perf.start_time), + current_time(perf.current_time) {} +}; + float const LOG_MIN_VALUE = 0.00000001f; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -72,8 +97,8 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, __global__ void update_metrics_label_kernel(float const *logits, float const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -136,17 +161,17 @@ __global__ void update_metrics_label_kernel(float const *logits, } } -void Metrics::update_metrics_sparse_label_kernel_wrapper( - float const *logit_ptr, - int const *label_ptr, - Metrics const *me, - int num_effective_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const *me, + int num_effective_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -154,32 +179,33 @@ void Metrics::update_metrics_sparse_label_kernel_wrapper( CUDA_NUM_THREADS, 0, stream>>>( - logit_ptr, label_ptr, perf, *me, num_effective_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, *me, num_effective_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } -void Metrics::update_metrics_label_kernel_wrapper(float const *logit_ptr, - float const *label_ptr, - Metrics const *me, - int num_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const *me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics 
perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); update_metrics_label_kernel<<>>( - logit_ptr, label_ptr, perf, *me, num_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, *me, num_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } }; // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu index 6c6e17a181..512981e32b 100644 --- a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu @@ -53,9 +53,9 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, BatchNormPerDeviceState const &m, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index b895ffb68f..afc3e1f7ef 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -50,30 +50,26 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume(); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume(); cast_backward<<>>( - input.get(), output.get(), volume, cast_to(1.0f)); + output.get(), input.get(), volume, cast_to(1.0f)); } }; void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + input.data_type, output.data_type, stream, input, output); } void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + output.data_type, input.data_type, stream, output, input); } } // namespace Cast diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu index 68004738d2..ad216feda2 100644 --- a/lib/kernels/src/cuda/ops/concat_kernels.cu +++ b/lib/kernels/src/cuda/ops/concat_kernels.cu @@ -17,9 +17,7 @@ #include "kernels/concat_kernels.h" #include -namespace FlexFlow { -namespace Kernels { -namespace Concat { +namespace FlexFlow::Kernels::Concat { void calc_blk_size(size_t &num_blocks, size_t &blk_size, @@ -87,6 +85,4 @@ void backward_kernel(cudaStream_t stream, } } -} // namespace Concat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Concat diff --git a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu index 
e3a4c97a31..0a4024ba8a 100644 --- a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu @@ -313,10 +313,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/element_unary_kernels.cu b/lib/kernels/src/cuda/ops/element_unary_kernels.cu index a35d28fa8c..687a9fa220 100644 --- a/lib/kernels/src/cuda/ops/element_unary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_unary_kernels.cu @@ -290,10 +290,10 @@ struct BackwardKernel { OperatorType op_type, std::optional scalar, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { checkCUDNN(cudnnSetStream(handle.dnn, stream)); if (use_cudnn(op_type)) { @@ -356,20 +356,20 @@ void backward_kernel(ffStream_t stream, ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { DataTypeDispatch1{}(input.data_type, stream, device_state, get_op_type(attrs), attrs.scalar, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } } // namespace ElementUnary diff --git a/lib/kernels/src/cuda/ops/flat_kernels.cu b/lib/kernels/src/cuda/ops/flat_kernels.cu index 941db108a0..f661e5fb0a 100644 --- a/lib/kernels/src/cuda/ops/flat_kernels.cu +++ b/lib/kernels/src/cuda/ops/flat_kernels.cu @@ -34,8 +34,8 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr) { + float const *output_grad_ptr, + float *input_grad_ptr) { float alpha = 1.0f; apply_add_with_scale diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index ca51f0d216..0d5a772918 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -108,10 +108,10 @@ LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, void forward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *weight_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size) { @@ -135,14 +135,14 @@ void forward_kernel(cudaStream_t stream, batch_size, in_dim, &alpha, - weight_ptr, + static_cast(weight_ptr), weight_type, in_dim, - input_ptr, + static_cast(input_ptr), input_type, in_dim, &beta, - output_ptr, + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -156,14 +156,14 @@ void forward_kernel(cudaStream_t stream, batch_size, 1, &alpha, - bias_ptr, + static_cast(bias_ptr), weight_type, 1, - m.one_ptr, + static_cast(m.one_ptr), 
CUDA_R_32F, 1, &alpha, - output_ptr, + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -174,10 +174,10 @@ void forward_kernel(cudaStream_t stream, m.actiDesc, &alpha, m.outputTensor, - output_ptr, + static_cast(output_ptr), &beta, m.outputTensor, - output_ptr)); + static_cast(output_ptr))); } else if (m.activation == Activation::GELU) { size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) @@ -191,13 +191,13 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_grad_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, int in_dim, int out_dim, int batch_size) { @@ -216,11 +216,17 @@ void backward_kernel(cudaStream_t stream, int output_size = out_dim * batch_size; if (m.activation.has_value()) { if (m.activation == Activation::RELU) { - relu_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + relu_backward_kernel(m.output_type, + static_cast(output_grad_ptr), + static_cast(output_ptr), + output_size, + stream); } else if (m.activation == Activation::SIGMOID) { - sigmoid_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + sigmoid_backward_kernel(m.output_type, + static_cast(output_grad_ptr), + static_cast(output_ptr), + output_size, + stream); } else { // TODO: only support relu and sigmoid for now assert(false && "Unsupported activation for Linear"); @@ -235,14 +241,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - input_ptr, + static_cast(input_ptr), input_type, in_dim, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - kernel_grad_ptr, + static_cast(kernel_grad_ptr), weight_type, in_dim, compute_type, @@ -261,12 +267,12 @@ void backward_kernel(cudaStream_t stream, in_dim, out_dim, &alpha, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim, &lambda, - (float *)kernel_ptr, + kernel_ptr, in_dim, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim)); } else { assert(false && "Only L2 regularization is supported"); @@ -284,14 +290,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - m.one_ptr, + static_cast(m.one_ptr), CUDA_R_32F, 1, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - bias_grad_ptr, + static_cast(bias_grad_ptr), weight_type, 1, compute_type, @@ -307,14 +313,14 @@ void backward_kernel(cudaStream_t stream, batch_size, out_dim, &alpha, - kernel_ptr, + static_cast(kernel_ptr), weight_type, in_dim, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - input_grad_ptr, + static_cast(input_grad_ptr), input_type, in_dim, compute_type, diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu index 1d07efb5fa..3687c1cedf 100644 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ b/lib/kernels/src/cuda/ops/partition_kernels.cu @@ -39,8 +39,8 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const 
&output_grad, + GenericTensorAccessorW const &input_grad) { add_kernel><<{}( - m.data_type, stream, m, input_grad, output_grad); + m.data_type, stream, m, output_grad, input_grad); } } // namespace Repartition diff --git a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu index 51fa29d289..f8b35ec885 100644 --- a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu @@ -112,10 +112,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr) { + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu index 0c6ba7d8e3..9c3e8dcc40 100644 --- a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduction_kernels.cu @@ -54,8 +54,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { checkCUDA(cudaMemcpyAsync(input.get(), output.get(), input.shape.num_elements() * size_of_datatype(T), @@ -73,9 +73,9 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { - DataTypeDispatch1{}(input.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(output.data_type, stream, output, input); } } // namespace Reduction diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index 76bfbe2658..1aa61375f0 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -50,8 +50,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { size_t total_elements = input.shape.num_elements() * num_replicas; replicate_backward_kernel> @@ -70,11 +70,11 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { DataTypeDispatch1{}( - input.data_type, stream, input, output, num_replicas); + input.data_type, stream, output, input, num_replicas); } } // namespace Replicate diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index 5b7843a3a5..b7a328ca08 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -42,8 +42,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { float alpha = 1.0f; apply_add_with_scale> <<{}(m.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + 
DataTypeDispatch1{}(m.data_type, stream, output, input); } } // namespace Reshape diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 8391a499df..2c25293c36 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -17,7 +17,6 @@ #include "kernels/reverse_kernels.h" namespace FlexFlow { - namespace Kernels { namespace Reverse { diff --git a/lib/kernels/src/cuda/ops/softmax_kernels.cu b/lib/kernels/src/cuda/ops/softmax_kernels.cu index 93ed85de18..d2498d08a4 100644 --- a/lib/kernels/src/cuda/ops/softmax_kernels.cu +++ b/lib/kernels/src/cuda/ops/softmax_kernels.cu @@ -61,8 +61,8 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - float *input_grad_ptr, float const *output_grad_ptr, + float *input_grad_ptr, size_t num_elements) { checkCUDA(cudaMemcpyAsync(input_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/transpose_kernels.cu b/lib/kernels/src/cuda/ops/transpose_kernels.cu index 3b3f80944d..37e1a08326 100644 --- a/lib/kernels/src/cuda/ops/transpose_kernels.cu +++ b/lib/kernels/src/cuda/ops/transpose_kernels.cu @@ -91,8 +91,8 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, TransposePerDeviceState const &m, - GenericTensorAccessorW const &in_grad, - GenericTensorAccessorR const &out_grad) { + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad) { TransposeStrides info; info.num_dim = in_grad.shape.num_dims(); diff --git a/lib/kernels/src/cuda/optimizer_kernel.cu b/lib/kernels/src/cuda/optimizer_kernels.cu similarity index 81% rename from lib/kernels/src/cuda/optimizer_kernel.cu rename to lib/kernels/src/cuda/optimizer_kernels.cu index 439eed9dec..1c6954a0b0 100644 --- a/lib/kernels/src/cuda/optimizer_kernel.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -13,7 +13,9 @@ * limitations under the License. 
*/ +#include "device.h" #include "kernels/optimizer_kernels.h" +#include "utils/exception.h" namespace FlexFlow { @@ -80,13 +82,25 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); + + auto const &state = meta->raw_variant; + ncclComm_t comm = std::visit( + [](auto const &s) -> ncclComm_t { + using T = std::decay_t; + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + throw mk_runtime_error("State type does not support NCCL operations"); + } else { + return s.handle.ncclComm; + } + }, + state); + + checkNCCL(ncclAllReduce( + w_grad_ptr, (float *)w_grad_ptr, size, ncclFloat, ncclSum, comm, stream)); + // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]"); @@ -157,7 +171,7 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, for (int i = 1; i < num_replicas; i++) { float const *src = w_grad_ptr + i * size; add_kernel<<>>( - size, 1.0f, src, (float *)w_grad_ptr); + (float *)w_grad_ptr, src, size); } // checkCUDA(cudaDeviceSynchronize()); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", @@ -188,13 +202,24 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, // Use NCCL to sync gradients cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); + + auto const &state = meta->raw_variant; + ncclComm_t comm = std::visit( + [](auto const &s) -> ncclComm_t { + using T = std::decay_t; + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + throw mk_runtime_error("State type does not support NCCL operations"); + } else { + return s.handle.ncclComm; + } + }, + state); + + checkNCCL(ncclAllReduce( + w_grad_ptr, (float *)w_grad_ptr, size, ncclFloat, ncclSum, comm, stream)); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", // op->alpha, op->alpha_t, op->weight_decay); // Step 2: Adam update diff --git a/lib/kernels/src/local_cpu_allocator.cc b/lib/kernels/src/local_cpu_allocator.cc new file mode 100644 index 0000000000..5cf337c685 --- /dev/null +++ b/lib/kernels/src/local_cpu_allocator.cc @@ -0,0 +1,30 @@ +#include "kernels/local_cpu_allocator.h" +#include "kernels/device.h" +#include "utils/containers/contains_key.h" + +namespace FlexFlow { +void *LocalCPUAllocator::allocate(size_t requested_memory_size) { + void *ptr = malloc(requested_memory_size); + this->ptrs.insert({ptr, std::unique_ptr(ptr, free)}); + return ptr; +} + +void LocalCPUAllocator::deallocate(void *ptr) { + if (contains_key(this->ptrs, ptr)) { + this->ptrs.erase(ptr); + } else { + throw std::runtime_error( + "Deallocating a pointer that was not allocated by this Allocator"); + } +} + +DeviceType LocalCPUAllocator::get_allocation_device_type() const { + return DeviceType::CPU; +} + +Allocator create_local_cpu_memory_allocator() { + Allocator allocator = Allocator::create(); + return allocator; +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/local_cuda_allocator.cc index cdcfb017a0..416768a479 100644 --- 
a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/local_cuda_allocator.cc @@ -20,6 +20,10 @@ void LocalCudaAllocator::deallocate(void *ptr) { } } +DeviceType LocalCudaAllocator::get_allocation_device_type() const { + return DeviceType::GPU; +} + LocalCudaAllocator::~LocalCudaAllocator() { for (void *ptr : this->ptrs) { checkCUDA(cudaFree(ptr)); @@ -27,7 +31,8 @@ LocalCudaAllocator::~LocalCudaAllocator() { } Allocator create_local_cuda_memory_allocator() { - return Allocator::create(); + Allocator allocator = Allocator::create(); + return allocator; } } // namespace FlexFlow diff --git a/lib/kernels/src/managed_ff_stream.cc b/lib/kernels/src/managed_ff_stream.cc index 7385b6cc3e..f0348aa91c 100644 --- a/lib/kernels/src/managed_ff_stream.cc +++ b/lib/kernels/src/managed_ff_stream.cc @@ -1,28 +1,36 @@ #include "kernels/managed_ff_stream.h" +#include "utils/exception.h" namespace FlexFlow { ManagedFFStream::ManagedFFStream() : stream(new ffStream_t) { - checkCUDA(cudaStreamCreate(stream)); + checkCUDA(cudaStreamCreate(this->stream)); } ManagedFFStream::ManagedFFStream(ManagedFFStream &&other) noexcept : stream(std::exchange(other.stream, nullptr)) {} ManagedFFStream &ManagedFFStream::operator=(ManagedFFStream &&other) noexcept { - std::swap(this->stream, other.stream); + if (this != &other) { + this->cleanup(); + this->stream = std::exchange(other.stream, nullptr); + } return *this; } ManagedFFStream::~ManagedFFStream() { - if (stream != nullptr) { - checkCUDA(cudaStreamDestroy(*stream)); - delete stream; + this->cleanup(); +} + +void ManagedFFStream::cleanup() { + if (this->stream != nullptr) { + checkCUDA(cudaStreamDestroy(*this->stream)); + delete this->stream; } } ffStream_t const &ManagedFFStream::raw_stream() const { - return *stream; + return *this->stream; } } // namespace FlexFlow diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index c050e887b6..9f1737240e 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -3,14 +3,15 @@ namespace FlexFlow { -ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { - handle = new PerDeviceFFHandle; - handle->workSpaceSize = 1024 * 1024; - handle->allowTensorOpMathConversion = true; - - checkCUDNN(cudnnCreate(&handle->dnn)); - checkCUBLAS(cublasCreate(&handle->blas)); - checkCUDA(cudaMalloc(&handle->workSpace, handle->workSpaceSize)); +ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( + size_t workSpaceSize, bool allowTensorOpMathConversion) { + this->handle = new PerDeviceFFHandle{}; + this->handle->workSpaceSize = workSpaceSize; + this->handle->allowTensorOpMathConversion = allowTensorOpMathConversion; + + checkCUDNN(cudnnCreate(&this->handle->dnn)); + checkCUBLAS(cublasCreate(&this->handle->blas)); + checkCUDA(cudaMalloc(&this->handle->workSpace, this->handle->workSpaceSize)); } ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( @@ -19,16 +20,23 @@ ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( ManagedPerDeviceFFHandle &ManagedPerDeviceFFHandle::operator=( ManagedPerDeviceFFHandle &&other) noexcept { - std::swap(this->handle, other.handle); + if (this != &other) { + this->cleanup(); + this->handle = std::exchange(other.handle, nullptr); + } return *this; } ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { - if (handle != nullptr) { - checkCUDNN(cudnnDestroy(handle->dnn)); - checkCUBLAS(cublasDestroy(handle->blas)); - checkCUDA(cudaFree(handle->workSpace)); - delete handle; + this->cleanup(); +} + 
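+// Shared teardown used by both the destructor and move assignment; a
+// moved-from object leaves `handle` null, making cleanup() a no-op.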
+void ManagedPerDeviceFFHandle::cleanup() { + if (this->handle != nullptr) { + checkCUDNN(cudnnDestroy(this->handle->dnn)); + checkCUBLAS(cublasDestroy(this->handle->blas)); + checkCUDA(cudaFree(this->handle->workSpace)); + delete this->handle; } } diff --git a/lib/kernels/test/CMakeLists.txt b/lib/kernels/test/CMakeLists.txt index 007740b510..981f87b3d8 100644 --- a/lib/kernels/test/CMakeLists.txt +++ b/lib/kernels/test/CMakeLists.txt @@ -14,4 +14,5 @@ ff_add_test_executable( cudnn cudart cublas + pcg ) diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index d44129ece1..023233ecb0 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -13,7 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) { size_t qoSeqLength = 20, kvSeqLength = 20; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -33,16 +35,16 @@ TEST_SUITE(FF_TEST_SUITE) { kvSeqLength, false); - TensorShape query_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, qSize}); - TensorShape key_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, kSize}); - TensorShape value_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, vSize}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, oProjSize}); + TensorShape query_shape = make_tensor_shape_from_legion_dims( + {qoSeqLength, num_samples, qSize}, DataType::FLOAT); + TensorShape key_shape = make_tensor_shape_from_legion_dims( + {kvSeqLength, num_samples, kSize}, DataType::FLOAT); + TensorShape value_shape = make_tensor_shape_from_legion_dims( + {kvSeqLength, num_samples, vSize}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {qoSeqLength, num_samples, oProjSize}, DataType::FLOAT); TensorShape weight_shape = - make_float_tensor_shape_from_legion_dims({state.weightSize}); + make_tensor_shape_from_legion_dims({state.weightSize}, DataType::FLOAT); GenericTensorAccessorW query_accessor = create_random_filled_accessor_w(query_shape, allocator); @@ -66,9 +68,7 @@ TEST_SUITE(FF_TEST_SUITE) { weight_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index 18e6977148..8a11a069f5 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -15,16 +15,18 @@ TEST_SUITE(FF_TEST_SUITE) { size_t seq_length = -1; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape_a = - make_float_tensor_shape_from_legion_dims({m, k, batch}); + make_tensor_shape_from_legion_dims({m, k, batch}, DataType::FLOAT); TensorShape input_shape_b = - make_float_tensor_shape_from_legion_dims({k, n, batch}); + 
make_tensor_shape_from_legion_dims({k, n, batch}, DataType::FLOAT); TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({m, n, batch}); + make_tensor_shape_from_legion_dims({m, n, batch}, DataType::FLOAT); GenericTensorAccessorW a_accessor = create_random_filled_accessor_w(input_shape_a, allocator); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 8487bbda6a..03a3a1ad40 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/batch_norm_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -9,7 +10,9 @@ TEST_SUITE(FF_TEST_SUITE) { size_t output_n = 1, output_c = 10, output_h = 10, output_w = 10; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -23,25 +26,25 @@ TEST_SUITE(FF_TEST_SUITE) { output_w, true); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape scale_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape bias_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape scale_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape bias_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); - GenericTensorAccessorW scale_accessor = - create_filled_accessor_w(scale_shape, allocator, 1.0f); + GenericTensorAccessorW scale_accessor = create_filled_accessor_w( + scale_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { - GenericTensorAccessorW bias_accessor = - create_filled_accessor_w(bias_shape, allocator, 0.0f); + GenericTensorAccessorW bias_accessor = create_filled_accessor_w( + bias_shape, allocator, make_float_data_type_value(0)); Kernels::BatchNorm::forward_kernel(managed_stream.raw_stream(), state, @@ -50,10 +53,7 @@ TEST_SUITE(FF_TEST_SUITE) { scale_accessor.get_float_ptr(), bias_accessor.get_float_ptr()); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { @@ -68,28 +68,18 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::BatchNorm::backward_kernel(managed_stream.raw_stream(), state, - input_accessor.get_float_ptr(), - output_grad_accessor.get_float_ptr(), output_accessor.get_float_ptr(), + output_grad_accessor.get_float_ptr(), + input_accessor.get_float_ptr(), 
input_grad_accessor.get_float_ptr(), scale_accessor.get_float_ptr(), scale_grad_accessor.get_float_ptr(), bias_grad_accessor.get_float_ptr(), input_accessor.shape.num_elements()); - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - std::vector host_scale_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(scale_grad_accessor)); - std::vector host_bias_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(bias_grad_accessor)); - - CHECK(contains_non_zero(host_input_grad_data)); - CHECK(contains_non_zero(host_scale_grad_data)); - CHECK(contains_non_zero(host_bias_grad_data)); + CHECK(contains_non_zero(input_grad_accessor)); + CHECK(contains_non_zero(scale_grad_accessor)); + CHECK(contains_non_zero(bias_grad_accessor)); } Kernels::BatchNorm::cleanup_kernel(allocator, diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index b110208bce..1be5839a9c 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -1,7 +1,7 @@ #include "doctest/doctest.h" #include "kernels/cast_kernels.h" +#include "kernels/cast_kernels_cpu.h" #include "test_utils.h" -#include using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { @@ -11,46 +11,68 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); TensorShape output_shape = - make_double_tensor_shape_from_legion_dims({100, 100}); - - GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + make_tensor_shape_from_legion_dims({100, 100}, DataType::DOUBLE); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - - Kernels::Cast::forward_kernel(managed_stream.raw_stream(), - input_accessor, - output_accessor, - DataType::FLOAT, - DataType::DOUBLE); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); - std::vector host_double_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor, output_accessor); - CHECK(contains_non_zero(host_double_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { + GenericTensorAccessorR grad_output_accessor = + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW grad_input_accessor = - allocator.allocate_tensor(input_shape); - - Kernels::Cast::backward_kernel( - managed_stream.raw_stream(), - read_only_accessor_from_write_accessor(output_accessor), - grad_input_accessor, - DataType::DOUBLE, - DataType::FLOAT); - - std::vector host_grad_float_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(grad_input_accessor)); - CHECK(contains_non_zero(host_grad_float_data)); + create_zero_filled_accessor_w(input_shape, allocator); + + Kernels::Cast::backward_kernel(managed_stream.raw_stream(), + grad_output_accessor, + grad_input_accessor); + + CHECK(contains_non_zero(grad_input_accessor)); + } + } + + TEST_CASE("Check Cast Forward Kernel against CPU Kernel") { + ManagedFFStream 
managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = + make_tensor_shape_from_legion_dims({10, 2}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({10, 2}, DataType::DOUBLE); + + // Only calling forward kernel as backward kernel is exactly the same + SUBCASE("forward_kernel") { + // Run GPU Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Cast::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 2e1000cb95..a4688a1030 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -1,39 +1,37 @@ #include "doctest/doctest.h" #include "kernels/combine_kernels.h" +#include "kernels/combine_kernels_cpu.h" #include "test_utils.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test combine kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + TEST_CASE("Call Combine Forward and Backward Kernels") { + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Combine::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -41,9 +39,64 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor, input_grad_accessor); - std::vector host_input_grad = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Combine Forward Kernel against CPU Kernel") { + ManagedFFStream 
managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = + make_tensor_shape_from_legion_dims({5, 5}, DataType::FLOAT); + TensorShape output_shape = input_shape; + + SUBCASE("forward_kernel") { + // Run GPU Combine Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Combine Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Combine::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu); + + // Run CPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, + input_grad_accessor_cpu); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index bf2a521b4e..b299f5dea8 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -1,29 +1,31 @@ #include "doctest/doctest.h" #include "kernels/concat_kernels.h" #include "test_utils.h" +#include "utils/containers/repeat.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { - size_t num_inputs = 3; - size_t size_per_input = 100; - ff_dim_t concat_axis = ff_dim_t(0); + size_t num_inputs = 2; + size_t size_per_input = 10; + ff_dim_t concat_axis = ff_dim_t{1}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({size_per_input}); - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({size_per_input, num_inputs}); + make_tensor_shape_from_legion_dims({size_per_input}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {num_inputs, size_per_input}, DataType::FLOAT); Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { std::vector input_accessors = repeat(num_inputs, [&]() { - return read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + return create_random_filled_accessor_r(input_shape, allocator); }); GenericTensorAccessorW 
output_accessor = allocator.allocate_tensor(output_shape); @@ -33,17 +35,12 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessors, concat_axis); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); std::vector input_grad_accessors = repeat( num_inputs, [&]() { return allocator.allocate_tensor(input_shape); }); diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 81f3c7183a..4be2bdf7bb 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -1,6 +1,7 @@ #include "doctest/doctest.h" #include "kernels/dropout_kernels.h" #include "test_utils.h" +#include "utils/containers/count.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { @@ -13,11 +14,13 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10}); + make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -30,8 +33,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -40,11 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output_accessor = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_accessor)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 70894858e3..0bb69aa1dc 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/flat_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -7,15 +8,18 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Flat Kernel") { Allocator allocator = create_local_cuda_memory_allocator(); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 2.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2))); 
SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -25,33 +29,21 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor, output_accessor.get_float_ptr()); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements(), 2.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 0.0f); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(0)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1)); Kernels::Flat::backward_kernel(managed_stream.raw_stream(), input_accessor, - input_grad_accessor.get_float_ptr(), - output_grad_accessor.get_float_ptr()); - - std::vector backward_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr()); - std::vector expected_output_data( - input_accessor.shape.num_elements(), 1.0f); - CHECK(backward_output_data == expected_output_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 88ac2f6889..7f97563217 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -5,24 +5,26 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); GatherPerDeviceState state = {managed_handle.raw_handle(), legion_dim_t(2)}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({50}, DataType::FLOAT); GenericTensorAccessorR index_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -32,16 +34,12 @@ TEST_SUITE(FF_TEST_SUITE) { index_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - 
create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); @@ -51,10 +49,7 @@ TEST_SUITE(FF_TEST_SUITE) { index_accessor, input_grad_accessor); - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 03b2f56bb9..7d7298f83d 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/layer_norm_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -11,13 +12,15 @@ TEST_SUITE(FF_TEST_SUITE) { float epsilon = 1e-5f; bool elementwise_affine = true; - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({batch_size, feature_size}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {batch_size, feature_size}, DataType::FLOAT); TensorShape output_shape = input_shape; TensorShape feature_shape = - make_float_tensor_shape_from_legion_dims({feature_size}); + make_tensor_shape_from_legion_dims({feature_size}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -31,16 +34,15 @@ TEST_SUITE(FF_TEST_SUITE) { epsilon); GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - GenericTensorAccessorW gamma_accessor = - create_filled_accessor_w(feature_shape, allocator, 1.0f); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW gamma_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - GenericTensorAccessorW beta_accessor = - create_filled_accessor_w(feature_shape, allocator, 0.0f); + GenericTensorAccessorW beta_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(0)); Kernels::LayerNorm::forward_kernel(managed_stream.raw_stream(), state, @@ -52,8 +54,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW gamma_grad_accessor = diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc new file mode 100644 index 0000000000..605aa6ffa1 --- /dev/null +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -0,0 +1,31 @@ +#include "doctest/doctest.h" +#include "kernels/managed_ff_stream.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("ManagedFFStream") { + ManagedFFStream base_stream{}; + ffStream_t const 
*base_stream_ptr = &base_stream.raw_stream(); + + SUBCASE("move constructor") { + ManagedFFStream new_stream(std::move(base_stream)); + CHECK(&base_stream.raw_stream() == nullptr); + CHECK(&new_stream.raw_stream() == base_stream_ptr); + } + + SUBCASE("move assignment operator") { + SUBCASE("move assign to other") { + ManagedFFStream new_stream{}; + new_stream = std::move(base_stream); + CHECK(&base_stream.raw_stream() == nullptr); + CHECK(&new_stream.raw_stream() == base_stream_ptr); + } + + SUBCASE("move assign to self") { + base_stream = std::move(base_stream); + CHECK(&base_stream.raw_stream() == base_stream_ptr); + } + } + } +} diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc new file mode 100644 index 0000000000..de3e5b72b1 --- /dev/null +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -0,0 +1,38 @@ +#include "doctest/doctest.h" +#include "kernels/managed_per_device_ff_handle.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("ManagedPerDeviceFFHandle") { + ManagedPerDeviceFFHandle base_handle{1024 * 1024, true}; + PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); + + SUBCASE("constructor") { + CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); + CHECK(base_handle.raw_handle().allowTensorOpMathConversion == true); + } + + SUBCASE("move constructor") { + ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); + + CHECK(&base_handle.raw_handle() == nullptr); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } + + SUBCASE("move assignment operator") { + SUBCASE("move assign to other") { + ManagedPerDeviceFFHandle new_handle{1024 * 1024, true}; + new_handle = std::move(base_handle); + + CHECK(&base_handle.raw_handle() == nullptr); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } + + SUBCASE("move assign to self") { + base_handle = std::move(base_handle); + CHECK(&base_handle.raw_handle() == base_handle_ptr); + } + } + } +} diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 437b37e954..e88c811803 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -1,12 +1,15 @@ #include "doctest/doctest.h" #include "kernels/partition_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -15,47 +18,33 @@ TEST_SUITE(FF_TEST_SUITE) { managed_handle.raw_handle(), DataType::FLOAT); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10}); + make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + GenericTensorAccessorR input_accessor = create_filled_accessor_r( + input_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Repartition::forward_kernel( managed_stream.raw_stream(), state, input_accessor, 
output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(1)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2)); Kernels::Repartition::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index ebb92d39db..00fa968235 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/pool_2d_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -12,7 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) { PoolOp pool_type = PoolOp::MAX; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -36,10 +39,10 @@ TEST_SUITE(FF_TEST_SUITE) { stride_w, pool_type); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {input_w, input_h, input_c, input_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_w, output_h, output_c, output_n}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {input_w, input_h, input_c, input_n}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {output_w, output_h, output_c, output_n}, DataType::FLOAT); GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); @@ -52,28 +55,23 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.ptr, output_accessor.ptr); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorW output_grad_accessor = create_filled_accessor_w( + output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Pool2D::backward_kernel(managed_stream.raw_stream(), state, - input_accessor.ptr, - 
input_grad_accessor.ptr, output_accessor.ptr, - output_grad_accessor.ptr); + output_grad_accessor.ptr, + input_accessor.ptr, + input_grad_accessor.ptr); - std::vector host_input_grad = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 1ea740f336..1c389cb20d 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/reduction_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -7,20 +8,22 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Reduction Forward and Backward Kernel") { std::size_t num_replicas = 5; - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10, 10, 10, 10}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {10, 10, 10, 10, 10}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({10}); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({10}, DataType::FLOAT); GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -29,30 +32,22 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor, num_replicas); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { TensorShape output_shape = input_shape; - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Reduction::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor, - output_grad_accessor); - - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements(), 1.0f); - std::vector host_grad_data = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(host_grad_data == expected_grad_input_data); + output_grad_accessor, + input_grad_accessor); + + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 86d790f03c..27223cc7b5 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -1,55 +1,113 @@ #include "doctest/doctest.h" #include "kernels/replicate_kernels.h" +#include "kernels/replicate_kernels_cpu.h" #include "test_utils.h" using namespace ::FlexFlow; 
TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test Replicate Kernel") { + TEST_CASE("Call Replicate Forward and Backward Kernels") { std::size_t num_replicas = 10; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); - TensorShape output_shape = input_shape; + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Replicate::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor, output_grad_accessor, + input_grad_accessor, + num_replicas); + + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Replicate Forward and Backward Kernel against CPU Kernel") { + std::size_t num_replicas = 2; + + TensorShape input_shape = + make_tensor_shape_from_legion_dims({5}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({5, num_replicas}, DataType::FLOAT); + + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + // Run GPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Replicate::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Replicate 
Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu, num_replicas); - std::vector check_aggregated_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(check_aggregated_data)); + // Run CPU Replicate Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Replicate::cpu_backward_kernel( + output_grad_accessor_cpu, input_grad_accessor_cpu, num_replicas); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index f56bfacc2b..5c04012da2 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -5,12 +5,15 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Reshape Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = input_shape; ReshapePerDeviceState state = @@ -18,42 +21,28 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Reshape::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + allocator.allocate_tensor(input_shape); Kernels::Reshape::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } 
} } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index cdaf65a305..4adf79847a 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -1,5 +1,7 @@ #include "doctest/doctest.h" #include "kernels/reverse_kernels.h" +#include "kernels/reverse_kernels_cpu.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -9,18 +11,21 @@ TEST_SUITE(FF_TEST_SUITE) { std::size_t in_blk_size = 10; std::size_t num_out_blks = 1; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1))); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -32,17 +37,14 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_accessor.shape.num_elements()); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(check_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + allocator.allocate_tensor(input_shape); Kernels::Reverse::backward_kernel( managed_stream.raw_stream(), @@ -53,10 +55,85 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_grad_accessor.shape.num_elements()); - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { + std::size_t num_out_blks = 4; + std::size_t reverse_dim_size = 3; + std::size_t in_blk_size = 2; + + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); + TensorShape output_shape = input_shape; + + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + auto transform = [counter = 0.0f](float val) mutable { + return counter++; + }; + + // Run GPU Cast Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), + 
input_accessor_gpu.get_float_ptr(), + output_accessor_gpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_accessor_gpu.shape.num_elements()); + + // Run CPU Reverse Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Reverse Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Reverse::backward_kernel( + managed_stream.raw_stream(), + output_grad_accessor_gpu.get_float_ptr(), + input_grad_accessor_gpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_grad_accessor_gpu.shape.num_elements()); + + // Run CPU Reverse Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Reverse::cpu_backward_kernel(output_grad_accessor_cpu, + input_grad_accessor_cpu); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index f49c1ebbcc..5519c30b80 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -8,12 +8,15 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Softmax Kernel Operations") { int input_n = 1, input_c = 1, input_h = 1, input_w = 100, channels = 100; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = input_shape; SoftmaxPerDeviceState state = Kernels::Softmax::init_kernel( @@ -31,30 +34,22 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Softmax::backward_kernel( managed_stream.raw_stream(), - input_grad_accessor.get_float_ptr(), output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr(), output_grad_accessor.shape.num_elements()); - std::vector expected_input_grad_data = - std::vector(input_grad_accessor.shape.num_elements(), 1.0f); - std::vector host_input_grad_data = -
load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(host_input_grad_data == expected_input_grad_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index 7cc2b28c9e..34993fa151 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -1,6 +1,8 @@ #include "doctest/doctest.h" #include "kernels/split_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" +#include "utils/containers/repeat.h" using namespace ::FlexFlow; @@ -11,13 +13,17 @@ TEST_SUITE(FF_TEST_SUITE) { coord_t in_blk_size = 100; coord_t num_blks = 1; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({50}, DataType::FLOAT); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = @@ -46,8 +52,8 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 0.0f); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(0)); Kernels::Split::backward_kernel(managed_stream.raw_stream(), input_grad_accessor.get_float_ptr(), diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 2fc186a257..0bc85cb8e0 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -7,9 +7,11 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Transpose Kernel Operations") { std::size_t num_dims = 2; - std::vector perm = {ff_dim_t(0), ff_dim_t(1)}; + std::vector perm = {ff_dim_t{0}, ff_dim_t{1}}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -18,41 +20,33 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Transpose::init_kernel(num_dims, perm); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10}); + make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Transpose::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR 
output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); + output_grad_accessor, + input_grad_accessor); - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index b591642570..bfed1241ba 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -1,105 +1,249 @@ #include "test_utils.h" +#include "op-attrs/tensor_shape.h" +#include -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements(); - std::vector host_data(volume); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dist(-1.0f, 1.0f); - - for (auto &val : host_data) { - val = dist(gen); +namespace FlexFlow { + +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW result_accessor = allocator.allocate_tensor(shape); + fill_with_zeros(result_accessor); + return result_accessor; +} + +TensorShape + make_tensor_shape_from_legion_dims(LegionOrdered const &dims, + DataType DT) { + return TensorShape{ + TensorDims{ + ff_ordered_from_legion_ordered(dims), + }, + DT, + }; +} + +template +struct CreateRandomFilledAccessorW { + GenericTensorAccessorW operator()(TensorShape const &shape, + Allocator &allocator) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); + + using T = real_type_t
<DT>; + T *data_ptr = src_accessor.get<DT>
(); + + std::random_device rd; + std::mt19937 gen(rd()); + size_t num_elements = get_num_elements(shape); + if constexpr (std::is_same::value) { + std::bernoulli_distribution dist(0.5); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } else if constexpr (std::is_floating_point::value) { + std::uniform_real_distribution dist(-1.0, 1.0); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } else if constexpr (std::is_integral::value) { + std::uniform_int_distribution dist(0, 100); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } + + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return dst_accessor; } +}; - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); +GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, + Allocator &allocator) { + return DataTypeDispatch1{}( + shape.data_type, shape, allocator); +} + +GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW accessor = + create_random_filled_accessor_w(shape, allocator); + + return read_only_accessor_from_write_accessor(accessor); +} + +template +struct FillWithZeros { + void operator()(GenericTensorAccessorW const &accessor) { + using T = real_type_t
; + + if (accessor.device_type == DeviceType::CPU) { + memset(accessor.ptr, 0, accessor.shape.get_volume() * sizeof(T)); + } else { + checkCUDA( + cudaMemset(accessor.ptr, 0, accessor.shape.get_volume() * sizeof(T))); + } } +}; - return accessor; +void fill_with_zeros(GenericTensorAccessorW const &accessor) { + DataTypeDispatch1{}(accessor.data_type, accessor); } -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - float val, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements(); - std::vector host_data(volume, val); - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); +template +struct CPUAccessorRContainsNonZero { + bool operator()(GenericTensorAccessorR const &accessor) { + using T = real_type_t
<DT>; + + T const *data_ptr = accessor.get<DT>
(); + + for (size_t i = 0; i < accessor.shape.num_elements(); i++) { + if (data_ptr[i] != 0) { + return true; + } + } + + return false; } +}; - return accessor; +bool contains_non_zero(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + copy_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); + return DataTypeDispatch1{}( + cpu_accessor.data_type, cpu_accessor); } -GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements(); - std::vector host_data(volume); +GenericTensorAccessorR + copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, + Allocator &cpu_allocator) { + GenericTensorAccessorR cpu_accessor = accessor; + if (accessor.device_type == DeviceType::GPU) { + cpu_accessor = copy_tensor_accessor_r(accessor, cpu_allocator); + } + return cpu_accessor; +} - for (size_t i = 0; i < volume; i++) { - host_data[i] = i; +GenericTensorAccessorW + copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor, + Allocator &cpu_allocator) { + GenericTensorAccessorW cpu_accessor = accessor; + if (accessor.device_type == DeviceType::GPU) { + cpu_accessor = copy_tensor_accessor_w(accessor, cpu_allocator); } + return cpu_accessor; +} + +template +struct Print2DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + using T = real_type_t
<DT>; + + T const *data_ptr = accessor.get<DT>
(); + int rows = accessor.shape.at(legion_dim_t{0}); + int cols = accessor.shape.at(legion_dim_t{1}); - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + stream << data_ptr[i * cols + j]; + + if (j < cols - 1) { + stream << " "; + } + } + stream << std::endl; + } } +}; - return accessor; +void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + copy_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); + DataTypeDispatch1<Print2DCPUAccessorR>{}( + cpu_accessor.data_type, cpu_accessor, stream); } -void fill_tensor_accessor_w(GenericTensorAccessorW accessor, - float val, - bool cpu_fill) { - LegionTensorDims dims = accessor.shape.dims; - size_t volume = accessor.shape.num_elements(); - std::vector host_data(volume, val); - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); +template +struct AccessorsAreEqual { + bool operator()(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor_a = + copy_accessor_r_to_cpu_if_necessary(accessor_a, cpu_allocator); + GenericTensorAccessorR cpu_accessor_b = + copy_accessor_r_to_cpu_if_necessary(accessor_b, cpu_allocator); + + using T = real_type_t
<DT>; + T const *a_data_ptr = cpu_accessor_a.get<DT>
(); + T const *b_data_ptr = cpu_accessor_b.get<DT>
(); + + for (size_t i = 0; i < accessor_a.shape.num_elements(); i++) { + if (a_data_ptr[i] != b_data_ptr[i]) { + return false; + } + } + + return true; + } +}; + +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + if (accessor_a.shape != accessor_b.shape) { + throw mk_runtime_error( + fmt::format("accessors_are_equal expected accessors to have the same " + "shape, but received: {} != {}", + accessor_a.shape, + accessor_b.shape)); } + return DataTypeDispatch1{}( + accessor_a.data_type, accessor_a, accessor_b); } -TensorShape make_float_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DataType::FLOAT, - }; +template +struct CreateFilledAccessorW { + GenericTensorAccessorW operator()(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + using T = real_type_t
<DT>; + if (!val.template has<T>()) { + throw mk_runtime_error("create_filled_accessor expected data type of " + "shape and passed-in value to match"); + } + + auto unwrapped_value = val.get<T>(); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); + + T *data_ptr = src_accessor.get<DT>
(); + for (size_t i = 0; i < dst_accessor.shape.num_elements(); i++) { + data_ptr[i] = unwrapped_value; + } + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + return dst_accessor; + } +}; + +GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + + return DataTypeDispatch1{}( + shape.data_type, shape, allocator, val); } -TensorShape make_double_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DataType::DOUBLE, - }; +GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + GenericTensorAccessorW w_accessor = + create_filled_accessor_w(shape, allocator, val); + return read_only_accessor_from_write_accessor(w_accessor); } +} // namespace FlexFlow diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index abce3fd444..d23b936cb0 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -1,48 +1,55 @@ #ifndef _FLEXFLOW_KERNELS_TEST_UTILS #define _FLEXFLOW_KERNELS_TEST_UTILS +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" #include "kernels/device.h" +#include "kernels/local_cpu_allocator.h" #include "kernels/local_cuda_allocator.h" #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" -#include +#include "op-attrs/datatype.h" +#include "op-attrs/datatype_value.dtg.h" + +namespace FlexFlow { GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill = false); + Allocator &allocator); -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - float val, - bool cpu_fill = false); +GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, + Allocator &allocator); + +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator); + +TensorShape + make_tensor_shape_from_legion_dims(LegionOrdered const &dims, + DataType DT); -GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill = false); +bool contains_non_zero(GenericTensorAccessorR const &accessor); -void fill_tensor_accessor_w(GenericTensorAccessorW accessor, - float val, - bool cpu_fill = false); +void fill_with_zeros(GenericTensorAccessorW const &accessor); -TensorShape make_float_tensor_shape_from_legion_dims(FFOrdered dims); +GenericTensorAccessorW + copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor, + Allocator &allocator); -TensorShape make_double_tensor_shape_from_legion_dims(FFOrdered dims); +GenericTensorAccessorR + copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, + Allocator &allocator); -template -std::vector load_data_to_host_from_device(GenericTensorAccessorR accessor) { - int volume = accessor.shape.get_volume(); +void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor); - std::vector local_data(volume); - checkCUDA(cudaMemcpy(local_data.data(), - accessor.ptr, - local_data.size() * sizeof(T), - cudaMemcpyDeviceToHost)); - return local_data; -} +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b); -template -bool contains_non_zero(std::vector &data) { - return !all_of(data, [](T const &val) { return val == 0; }); -} +GenericTensorAccessorW 
create_filled_accessor_w(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val); + +GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val); +} // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/local_cpu_allocator.h b/lib/local-execution/include/local-execution/local_cpu_allocator.h index d1e81facf2..cf6cfe35d1 100644 --- a/lib/local-execution/include/local-execution/local_cpu_allocator.h +++ b/lib/local-execution/include/local-execution/local_cpu_allocator.h @@ -12,6 +12,8 @@ struct LocalCPUAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_map> ptrs; }; diff --git a/lib/local-execution/include/local-execution/per_device_op_state.h b/lib/local-execution/include/local-execution/per_device_op_state.h index 1edd5b6360..f1f357a86e 100644 --- a/lib/local-execution/include/local-execution/per_device_op_state.h +++ b/lib/local-execution/include/local-execution/per_device_op_state.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H #define _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific_device_states.dtg.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 54c8dfc5f1..48584588e3 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific.h" #include "local-execution/itask_argument_accessor.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index 731e04fdc8..f697337c52 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -13,6 +13,9 @@ struct TrackedAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + + DeviceType get_allocation_device_type() const override; + size_t get_current_mem_usage(); private: diff --git a/lib/local-execution/src/local_cpu_allocator.cc b/lib/local-execution/src/local_cpu_allocator.cc index 4ca5f987a8..c4657e26b5 100644 --- a/lib/local-execution/src/local_cpu_allocator.cc +++ b/lib/local-execution/src/local_cpu_allocator.cc @@ -17,6 +17,10 @@ void LocalCPUAllocator::deallocate(void *ptr) { } } +DeviceType LocalCPUAllocator::get_allocation_device_type() const { + return DeviceType::CPU; +} + Allocator create_local_cpu_memory_allocator() { return Allocator::create(); } diff --git a/lib/local-execution/src/local_task_argument_accessor.cc b/lib/local-execution/src/local_task_argument_accessor.cc index 54eca7e514..5d099c6b46 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -24,8 +24,8 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( 
auto tensor_backing = std::get( this->tensor_slots_backing.at(slot_grad_pair)); if (priv == Permissions::RO) { - GenericTensorAccessorR readonly_tensor_backing = { - tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}; + GenericTensorAccessorR readonly_tensor_backing = + read_only_accessor_from_write_accessor(tensor_backing); return readonly_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { return tensor_backing; @@ -33,6 +33,7 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); } } + VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( slot_id_t slot, Permissions priv, IsGrad is_grad) const { SlotGradId slot_grad_pair = SlotGradId{slot, is_grad}; @@ -43,7 +44,7 @@ VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( for (GenericTensorAccessorW const &tensor_backing : variadic_tensor_backing) { readonly_variadic_tensor_backing.push_back( - {tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}); + read_only_accessor_from_write_accessor(tensor_backing)); } return readonly_variadic_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 851566fc02..3aed3111c7 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -133,9 +133,9 @@ static std::optional profiling, "[BatchNorm] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - output_grad.get_float_ptr(), output.get_float_ptr(), + output_grad.get_float_ptr(), + input.get_float_ptr(), input_grad.get_float_ptr(), scale.get_float_ptr(), scale_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 3e7baf49a9..e9adf88422 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -54,9 +54,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { profiling, "[Cast] forward_time = {:.2lf}ms\n", input, - output, - input.data_type, - attrs.dtype); + output); } static std::optional @@ -73,9 +71,7 @@ static std::optional profiling, "[Cast] forward_time = {:.2lf}ms\n", input_grad, - output_grad, - input.data_type, - attrs.dtype); + output_grad); } TaskImplFunction get_cast_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc index d5c6e7f851..d7c5c22170 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -108,8 +108,8 @@ static std::optional acc.get_argument(PER_DEVICE_STATE); auto attrs = acc.get_argument(ATTRS); - auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); auto filter = acc.get_tensor(FILTER); auto input_grad = acc.get_tensor_grad(INPUT); @@ -121,10 +121,10 @@ static std::optional profiling, "[Conv2d] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), filter.get_float_ptr(), filter_grad.get_float_ptr(), bias_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index 4ee609bd6c..10f1dce294 100644 --- a/lib/local-execution/src/ops/element_unary.cc 
+++ b/lib/local-execution/src/ops/element_unary.cc @@ -89,10 +89,10 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor_grad(INPUT); auto const &attrs = acc.get_argument(ATTRS); auto handle = acc.get_argument(HANDLE); @@ -107,10 +107,10 @@ static std::optional per_device_state, attrs, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } TaskImplFunction get_element_unary_init_task_impl() { diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 3fe5029fa1..8d998a8672 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -41,15 +41,15 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Flat] backward_time = {:.2lf}ms\n", input, - input_grad.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_flat_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 9934e2a45c..b567937c70 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -125,17 +125,17 @@ static std::optional auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - auto bias = acc.get_tensor(BIAS); + auto bias = acc.get_tensor(BIAS); auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); - float const *bias_ptr = NULL; + float *bias_ptr = NULL; if (attrs.use_bias) { bias_ptr = bias.get_float_ptr(); } @@ -148,13 +148,13 @@ static std::optional profiling, "[Linear] backward_time = {:.2lf}ms\n", per_device_state, - (void *)input.get_float_ptr(), - (void *)input_grad.get_float_ptr(), - (void *)output.get_float_ptr(), - (void *)output_grad.get_float_ptr(), - (void *)weight.get_float_ptr(), - (void *)weight_grad.get_float_ptr(), - (void *)bias_ptr, + output.get_float_ptr(), + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + bias_ptr, in_dim, out_dim, batch_size); diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index 33d62b713c..2e7fb8ce91 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -30,14 +30,14 @@ static DeviceSpecificDeviceStates auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - int input_w = input.shape.at(ff_dim_t(0)) + 1; - int input_h = input.shape.at(ff_dim_t(1)) + 1; - int input_c = input.shape.at(ff_dim_t(2)) + 1; - int input_n = input.shape.at(ff_dim_t(3)) + 1; - int output_w = 
output.shape.at(ff_dim_t(0)) + 1; - int output_h = output.shape.at(ff_dim_t(1)) + 1; - int output_c = output.shape.at(ff_dim_t(2)) + 1; - int output_n = output.shape.at(ff_dim_t(3)) + 1; + int input_w = input.shape.at(ff_dim_t{0}) + 1; + int input_h = input.shape.at(ff_dim_t{1}) + 1; + int input_c = input.shape.at(ff_dim_t{2}) + 1; + int input_n = input.shape.at(ff_dim_t{3}) + 1; + int output_w = output.shape.at(ff_dim_t{0}) + 1; + int output_h = output.shape.at(ff_dim_t{1}) + 1; + int output_c = output.shape.at(ff_dim_t{2}) + 1; + int output_n = output.shape.at(ff_dim_t{3}) + 1; printf("init pool (input): n(%d) c(%d) h(%d) " "w(%d)\n", @@ -125,19 +125,19 @@ static std::optional Pool2DPerDeviceState state = acc.get_argument(PER_DEVICE_STATE); - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor(INPUT); return profile(backward_kernel, profiling, "[Pool2D] backward_time = {:.2lf}ms\n", state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_pool_2d_init_task_impl() { diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index a58d79a4f8..1e85d7186e 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -64,13 +64,13 @@ static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Reduction] backward_time = {:.2lf}ms\n", - input_grad, - output_grad); + output_grad, + input_grad); } TaskImplFunction get_reduction_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/ops/repartition.cc index 73692f4a13..655e1f238b 100644 --- a/lib/local-execution/src/ops/repartition.cc +++ b/lib/local-execution/src/ops/repartition.cc @@ -86,8 +86,8 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(INPUT); + auto input_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index 135475a711..56bbfdd371 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -67,8 +67,8 @@ static std::optional return profile(backward_kernel, profiling, "[replicate] backward_time = {:.2lf}ms\n", - input_grad, output_grad, + input_grad, attrs.replicate_degree); } diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc index 7584d405eb..761718a9a7 100644 --- a/lib/local-execution/src/ops/reshape.cc +++ b/lib/local-execution/src/ops/reshape.cc @@ -87,8 +87,8 @@ static std::optional profiling, "[Reshape] backward time = {:.2lf}ms\n", per_device_state, - input_grad, - output_grad); + output_grad, + input_grad); } TaskImplFunction get_reshape_init_task_impl() { diff 
--git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc index 366a579bea..bb1b802edd 100644 --- a/lib/local-execution/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -53,11 +53,11 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; for (int i = 0; i < output.shape.get_dim(); i++) { if (i < axis.value) { - in_blk_size *= output.shape.at(ff_dim_t(i)); + in_blk_size *= output.shape.at(ff_dim_t{i}); } else if (i == axis.value) { - reverse_dim_size = output.shape.at(ff_dim_t(i)); + reverse_dim_size = output.shape.at(ff_dim_t{i}); } else { - num_out_blks *= output.shape.at(ff_dim_t(i)); + num_out_blks *= output.shape.at(ff_dim_t{i}); } } @@ -83,11 +83,11 @@ static std::optional coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; for (int i = 0; i < input_grad.shape.get_dim(); i++) { if (i < axis) { - in_blk_size *= input_grad.shape.at(ff_dim_t(i)); + in_blk_size *= input_grad.shape.at(ff_dim_t{i}); } else if (i == axis) { - reverse_dim_size = input_grad.shape.at(ff_dim_t(i)); + reverse_dim_size = input_grad.shape.at(ff_dim_t{i}); } else { - num_out_blks *= input_grad.shape.at(ff_dim_t(i)); + num_out_blks *= input_grad.shape.at(ff_dim_t{i}); } } diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc index 4c7979ae9b..9c5757112c 100644 --- a/lib/local-execution/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -102,8 +102,8 @@ static std::optional return profile(backward_kernel, profiling, "[SoftMax] backward_time = {:.2lf}ms\n", - input_grad.get_float_ptr(), output_grad.get_float_ptr(), + input_grad.get_float_ptr(), output_grad.shape.get_volume()); } diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc index 3e4ac15db3..0176e6d578 100644 --- a/lib/local-execution/src/ops/transpose.cc +++ b/lib/local-execution/src/ops/transpose.cc @@ -88,8 +88,8 @@ static std::optional profiling, "[Transpose] Backward_time = {:.2lf} [ms]", per_device_state, - input_grad, - output_grad); + output_grad, + input_grad); } OpTaskInvocation backward(TransposeAttrs const &attrs) { diff --git a/lib/local-execution/src/per_device_state.cc b/lib/local-execution/src/per_device_op_state.cc similarity index 100% rename from lib/local-execution/src/per_device_state.cc rename to lib/local-execution/src/per_device_op_state.cc diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index e6c3a11711..ed181aea32 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -23,8 +23,13 @@ size_t TrackedAllocator::get_current_mem_usage() { return this->current_mem_usage; } +DeviceType TrackedAllocator::get_allocation_device_type() const { + return this->allocator.get_allocation_device_type(); +} + Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { - return Allocator::create(base_allocator); + Allocator allocator = Allocator::create(base_allocator); + return allocator; } } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index da3af6e3ad..512c1ef33b 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,7 +12,11 @@ // TEST_SUITE(FF_CUDA_TEST_SUITE) { // TEST_CASE("Local Cost 
Estimator") { // // local backing initialization -// ManagedPerDeviceFFHandle managed_handle{}; +// ManagedPerDeviceFFHandle managed_handle{ +/*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true +} +; // RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ // DeviceSpecific::create(managed_handle.raw_handle()), diff --git a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml index 27aa50f38f..2c524c120a 100644 --- a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml +++ b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml @@ -10,5 +10,8 @@ features = [ [[values]] name = "SUM" -[[value]] +[[values]] name = "AVG" + +[[values]] +name = "NONE" diff --git a/lib/op-attrs/include/op-attrs/datatype_value.h b/lib/op-attrs/include/op-attrs/datatype_value.h new file mode 100644 index 0000000000..723e69bddd --- /dev/null +++ b/lib/op-attrs/include/op-attrs/datatype_value.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H + +#include "op-attrs/datatype_value.dtg.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value); +DataTypeValue make_double_data_type_value(double value); +DataTypeValue make_int32_data_type_value(int32_t value); +DataTypeValue make_int64_data_type_value(int64_t value); +DataTypeValue make_bool_data_type_value(bool value); + +} // namespace FlexFlow + +#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h index 6aa23d40fc..19a6e62178 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h +++ b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h @@ -175,8 +175,9 @@ auto inner_to_outer(FFOrdered const &ff_ordered) template std::vector inner_to_outer_idxs(FFOrdered const &ff_ordered) { std::vector idxs; - for (size_t i = 0; i < ff_ordered.size(); i++) { - idxs.push_back(ff_dim_t(ff_ordered.size() - i - 1)); + int size = static_cast(ff_ordered.size()); + for (int i = 0; i < ff_ordered.size(); i++) { + idxs.push_back(ff_dim_t{size - i - 1}); } return idxs; } diff --git a/lib/op-attrs/include/op-attrs/make_datatype_value.h b/lib/op-attrs/include/op-attrs/make_datatype_value.h new file mode 100644 index 0000000000..af4792dd9e --- /dev/null +++ b/lib/op-attrs/include/op-attrs/make_datatype_value.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H + +#include "op-attrs/datatype_value.dtg.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value); +DataTypeValue make_double_data_type_value(double value); +DataTypeValue make_int32_data_type_value(int32_t value); +DataTypeValue make_int64_data_type_value(int64_t value); +DataTypeValue make_bool_data_type_value(bool value); + +} // namespace FlexFlow + +#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/src/op-attrs/make_datatype_value.cc b/lib/op-attrs/src/op-attrs/make_datatype_value.cc new file mode 100644 index 0000000000..76d712949a --- /dev/null +++ b/lib/op-attrs/src/op-attrs/make_datatype_value.cc @@ -0,0 +1,25 @@ +#include "op-attrs/make_datatype_value.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value) { + return DataTypeValue{value}; +} + +DataTypeValue 
make_double_data_type_value(double value) { + return DataTypeValue{value}; +} + +DataTypeValue make_int32_data_type_value(int32_t value) { + return DataTypeValue{value}; +} + +DataTypeValue make_int64_data_type_value(int64_t value) { + return DataTypeValue{value}; +} + +DataTypeValue make_bool_data_type_value(bool value) { + return DataTypeValue{value}; +} + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ops/attention.cc b/lib/op-attrs/src/op-attrs/ops/attention.cc index 483d832fee..8a806bcf9f 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention.cc @@ -33,15 +33,15 @@ int get_oProjSize(MultiHeadAttentionAttrs const &attrs) { } int get_qSize(TensorShape const &query_shape) { - return dim_at_idx(query_shape, ff_dim_t(0)); + return dim_at_idx(query_shape, ff_dim_t{0}); } int get_kSize(TensorShape const &key_shape) { - return dim_at_idx(key_shape, ff_dim_t(0)); + return dim_at_idx(key_shape, ff_dim_t{0}); } int get_vSize(TensorShape const &value_shape) { - return dim_at_idx(value_shape, ff_dim_t(0)); + return dim_at_idx(value_shape, ff_dim_t{0}); } int get_qSize(MultiHeadAttentionParallelInputs const &inputs) { diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc index dcc567e0ca..6ea29b1855 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc @@ -138,7 +138,7 @@ std::unordered_set get_parallel_tensor_dim_indices(ParallelTensorShape const &shape) { std::unordered_set indices; extend(indices, transform(range(num_shard_dims(shape.dims)), [](int idx) { - return parallel_tensor_dim_idx_t(ff_dim_t(idx)); + return parallel_tensor_dim_idx_t(ff_dim_t{idx}); })); indices.insert(parallel_tensor_dim_idx_t(ReplicaType::SUM)); indices.insert(parallel_tensor_dim_idx_t(ReplicaType::DISCARD_COPY)); diff --git a/lib/pcg/include/pcg/metric.h b/lib/pcg/include/pcg/metric.h new file mode 100644 index 0000000000..718919112f --- /dev/null +++ b/lib/pcg/include/pcg/metric.h @@ -0,0 +1,72 @@ +#ifndef _FF_METRICS_H_ +#define _FF_METRICS_H_ + +#include "op-attrs/ops/loss_functions/loss_functions.h" +#include "utils/fmt.h" +#include &lt;vector&gt; + +namespace FlexFlow { + +enum class Metric { + ACCURACY, + CATEGORICAL_CROSSENTROPY, + SPARSE_CATEGORICAL_CROSSENTROPY, + MEAN_SQUARED_ERROR, + ROOT_MEAN_SQUARED_ERROR, + MEAN_ABSOLUTE_ERROR, +}; + +class MetricsAttrs { +public: + MetricsAttrs() = delete; + MetricsAttrs(LossFunction, std::vector&lt;Metric&gt; const &); + +public: + LossFunction loss_type; + bool measure_accuracy; + bool measure_categorical_crossentropy; + bool measure_sparse_categorical_crossentropy; + bool measure_mean_squared_error; + bool measure_root_mean_squared_error; + bool measure_mean_absolute_error; +}; + +} // namespace FlexFlow + +namespace fmt { + +template <> +struct formatter<::FlexFlow::Metric> : formatter { + template + auto format(::FlexFlow::Metric m, FormatContext &ctx) const + -> decltype(ctx.out()) { + using namespace FlexFlow; + + string_view name = "unknown"; + switch (m) { + case Metric::ACCURACY: + name = "Accuracy"; + break; + case Metric::CATEGORICAL_CROSSENTROPY: + name = "CategoricalCrossEntropy"; + break; + case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: + name = "SparseCategoricalCrossEntropy"; + break; + case Metric::MEAN_SQUARED_ERROR: + name = "MeanSquaredError"; + break; + case Metric::ROOT_MEAN_SQUARED_ERROR: + name = "RootMeanSquaredError"; + break; + case Metric::MEAN_ABSOLUTE_ERROR: + name = 
"MeanAbsoluteError"; + break; + } + return formatter::format(name, ctx); + } +}; + +} // namespace fmt + +#endif diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc index dff647f5a1..65ef214669 100644 --- a/lib/pcg/src/pcg/computation_graph_builder.cc +++ b/lib/pcg/src/pcg/computation_graph_builder.cc @@ -3,6 +3,7 @@ #include "op-attrs/get_incoming_tensor_roles.h" #include "op-attrs/get_op_type.h" #include "op-attrs/get_output_shapes.h" +#include "op-attrs/make_datatype_value.h" #include "op-attrs/ops/attention.h" #include "op-attrs/ops/batch_norm.h" #include "op-attrs/ops/broadcast.h" @@ -609,14 +610,14 @@ tensor_guid_t ComputationGraphBuilder::batch_norm( TensorShape gamma_shape = throw_if_unexpected(get_gamma_weights_shape(attrs, input_shape)); - InitializerAttrs gamma_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{1}}}}; + InitializerAttrs gamma_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(1)}}; weights.push_back(make_weight_attrs(gamma_shape, gamma_initializer)); TensorShape beta_shape = throw_if_unexpected(get_beta_weights_shape(attrs, input_shape)); - InitializerAttrs beta_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs beta_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back(make_weight_attrs(beta_shape, beta_initializer)); } @@ -688,8 +689,8 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention( get_input_bias_shape(attrs, query_shape, key_shape, value_shape)); // initializer chosen based on // https://github.com/pytorch/pytorch/blob/31c4e0d37d8efc37a0697159e5b9121ec34d5141/torch/nn/modules/activation.py#L1120-L1121 - InitializerAttrs input_bias_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs input_bias_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back( make_weight_attrs(input_bias_shape, input_bias_initializer)); @@ -698,8 +699,8 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention( get_output_bias_shape(attrs, query_shape, key_shape, value_shape)); // initializer chosen based on // https://github.com/pytorch/pytorch/blob/31c4e0d37d8efc37a0697159e5b9121ec34d5141/torch/nn/modules/activation.py#L1120-L1121 - InitializerAttrs output_bias_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs output_bias_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back( make_weight_attrs(output_bias_shape, output_bias_initializer)); @@ -870,14 +871,14 @@ tensor_guid_t ComputationGraphBuilder::layer_norm( TensorShape gamma_shape = throw_if_unexpected(get_gamma_weights_shape(attrs, input_shape)); - InitializerAttrs gamma_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{1}}}}; + InitializerAttrs gamma_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(1)}}; weights.push_back(make_weight_attrs(gamma_shape, gamma_initializer)); TensorShape beta_shape = throw_if_unexpected(get_beta_weights_shape(attrs, input_shape)); - InitializerAttrs beta_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs beta_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; 
weights.push_back(make_weight_attrs(beta_shape, beta_initializer)); } diff --git a/lib/pcg/src/pcg/metric.cc b/lib/pcg/src/pcg/metric.cc new file mode 100644 index 0000000000..69aba90d12 --- /dev/null +++ b/lib/pcg/src/pcg/metric.cc @@ -0,0 +1,38 @@ +#include "pcg/metric.h" + +namespace FlexFlow { +MetricsAttrs::MetricsAttrs(LossFunction _loss_type, + std::vector&lt;Metric&gt; const &metrics) + : loss_type(_loss_type), measure_accuracy(false), + measure_categorical_crossentropy(false), + measure_sparse_categorical_crossentropy(false), + measure_mean_squared_error(false), measure_root_mean_squared_error(false), + measure_mean_absolute_error(false) { + for (Metric const &m : metrics) { + switch (m) { + case Metric::ACCURACY: + measure_accuracy = true; + continue; + case Metric::CATEGORICAL_CROSSENTROPY: + measure_categorical_crossentropy = true; + continue; + case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: + measure_sparse_categorical_crossentropy = true; + continue; + case Metric::MEAN_SQUARED_ERROR: + measure_mean_squared_error = true; + continue; + case Metric::ROOT_MEAN_SQUARED_ERROR: + measure_root_mean_squared_error = true; + continue; + case Metric::MEAN_ABSOLUTE_ERROR: + measure_mean_absolute_error = true; + continue; + default: + throw mk_runtime_error( + "Initializing MetricsAttrs with unrecognized metrics type"); + } + } +} + +} // namespace FlexFlow diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index f33b4dcd17..79ac43ae66 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -1,5 +1,6 @@ #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" #include "op-attrs/get_incoming_tensor_roles.h" +#include "op-attrs/make_datatype_value.h" #include "op-attrs/ops/attention.h" #include "op-attrs/ops/batch_matmul.h" #include "op-attrs/ops/batch_norm.h" @@ -385,14 +386,14 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::batch_norm( ParallelTensorShape gamma_shape = throw_if_unexpected(get_gamma_weights_shape(attrs, input_shape)); - InitializerAttrs gamma_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{1}}}}; + InitializerAttrs gamma_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(1)}}; weights.push_back(make_weight_attrs(gamma_shape, gamma_initializer)); ParallelTensorShape beta_shape = throw_if_unexpected(get_beta_weights_shape(attrs, input_shape)); - InitializerAttrs beta_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs beta_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back(make_weight_attrs(beta_shape, beta_initializer)); } diff --git a/lib/runtime/src/metrics_functions.cc b/lib/runtime/src/metrics_functions.cc index feb6e704b2..33e15baed2 100644 --- a/lib/runtime/src/metrics_functions.cc +++ b/lib/runtime/src/metrics_functions.cc @@ -25,39 +25,6 @@ namespace FlexFlow { LegionRuntime::Logger::Category log_metrics("metrics"); -MetricsAttrs::MetricsAttrs(LossFunction _loss_type, - std::vector const &metrics) - : loss_type(_loss_type), measure_accuracy(false), - measure_categorical_crossentropy(false), - measure_sparse_categorical_crossentropy(false), - measure_mean_squared_error(false), measure_root_mean_squared_error(false), - 
measure_mean_absolute_error(false) { - for (Metric const &m : metrics) { - switch (m) { - case Metric::ACCURACY: - measure_accuracy = true; - continue; - case Metric::CATEGORICAL_CROSSENTROPY: - measure_categorical_crossentropy = true; - continue; - case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: - measure_sparse_categorical_crossentropy = true; - continue; - case Metric::MEAN_SQUARED_ERROR: - measure_mean_squared_error = true; - continue; - case Metric::ROOT_MEAN_SQUARED_ERROR: - measure_root_mean_squared_error = true; - continue; - case Metric::MEAN_ABSOLUTE_ERROR: - measure_mean_absolute_error = true; - continue; - default: - throw mk_runtime_error("Unrecogonized metrics type {}", m); - } - } -} - enum Slots { LOGIT, LABEL, diff --git a/lib/runtime/src/metrics_functions.h b/lib/runtime/src/metrics_functions.h index fbb0b633bf..73dc3bbc51 100644 --- a/lib/runtime/src/metrics_functions.h +++ b/lib/runtime/src/metrics_functions.h @@ -16,38 +16,13 @@ #ifndef _FF_METRICS_FUNCTIONS_H_ #define _FF_METRICS_FUNCTIONS_H_ +#include "kernels/metric.h" #include "kernels/perf_metrics.h" #include "legion.h" -#include "op-attrs/ops/loss_functions.h" #include "task_spec/task_invocation.h" -#include "utils/fmt.h" namespace FlexFlow { -enum class Metric { - ACCURACY, - CATEGORICAL_CROSSENTROPY, - SPARSE_CATEGORICAL_CROSSENTROPY, - MEAN_SQUARED_ERROR, - ROOT_MEAN_SQUARED_ERROR, - MEAN_ABSOLUTE_ERROR, -}; - -class MetricsAttrs { -public: - MetricsAttrs() = delete; - MetricsAttrs(LossFunction, std::vector const &); - -public: - LossFunction loss_type; - bool measure_accuracy; - bool measure_categorical_crossentropy; - bool measure_sparse_categorical_crossentropy; - bool measure_mean_squared_error; - bool measure_root_mean_squared_error; - bool measure_mean_absolute_error; -}; - TypedIndexTaskInvocation compute_metrics(MetricsAttrs const &, parallel_tensor_guid_t const &logit, @@ -79,40 +54,4 @@ VISITABLE_STRUCT(::FlexFlow::MetricsAttrs, measure_root_mean_squared_error, measure_mean_absolute_error); -namespace fmt { - -template <> -struct formatter<::FlexFlow::Metric> : formatter { - template - auto format(::FlexFlow::Metric m, FormatContext &ctx) const - -> decltype(ctx.out()) { - using namespace FlexFlow; - - string_view name = "unknown"; - switch (m) { - case Metric::ACCURACY: - name = "Accuracy"; - break; - case Metric::CATEGORICAL_CROSSENTROPY: - name = "CategoricalCrossEntropy"; - break; - case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: - name = "SparseCategoricalCrossEntropy"; - break; - case Metric::MEAN_SQUARED_ERROR: - name = "MeanSquaredError"; - break; - case Metric::ROOT_MEAN_SQUARED_ERROR: - name = "RootMeanSquaredError"; - break; - case Metric::MEAN_ABSOLUTE_ERROR: - name = "MeanAbsoluteError"; - break; - } - return formatter::format(name, ctx); - } -}; - -} // namespace fmt - #endif diff --git a/lib/runtime/src/ops/embedding.cc b/lib/runtime/src/ops/embedding.cc index 2370739d58..f34751ef8d 100644 --- a/lib/runtime/src/ops/embedding.cc +++ b/lib/runtime/src/ops/embedding.cc @@ -77,15 +77,15 @@ static std::optional return profile(backward_kernel, profiling, "[Embedding] backward_time = {:.2lf}ms\n", - input, output, + input, weight_grad, - input.data_type, output.data_type, + input.data_type, attrs.aggr, input.shape.get_dim(), output.shape.get_dim(), - input.shape.at(ff_dim_t(0))); + input.shape.at(ff_dim_t{0})); } TaskImplFunction get_embedding_fwd_task_impl() {