From 05144e58ed0ad434ed93388ce15ca723a075712b Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Thu, 11 Jul 2024 14:33:42 -0700 Subject: [PATCH 01/20] test_utils refactor, local_cpu_allocator --- .../include/kernels/local_cpu_allocator.h | 22 ++++ lib/kernels/src/local_cpu_allocator.cc | 35 ++++++ lib/kernels/test/src/test_attention_kernel.cc | 37 +++--- .../test/src/test_batch_matmul_kernel.cc | 14 +-- .../test/src/test_batch_norm_kernel.cc | 36 +++--- lib/kernels/test/src/test_cast_kernel.cc | 8 +- lib/kernels/test/src/test_combine_kernel.cc | 6 +- lib/kernels/test/src/test_concat_kernel.cc | 9 +- lib/kernels/test/src/test_dropout.cc | 8 +- lib/kernels/test/src/test_flat_kernel.cc | 9 +- lib/kernels/test/src/test_gather_kernels.cc | 14 ++- .../test/src/test_layer_norm_kernels.cc | 15 +-- lib/kernels/test/src/test_partition_kernel.cc | 8 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 16 +-- lib/kernels/test/src/test_reduction_kernel.cc | 10 +- lib/kernels/test/src/test_replicate_kernel.cc | 9 +- lib/kernels/test/src/test_reshape_kernel.cc | 9 +- lib/kernels/test/src/test_reverse_kernels.cc | 9 +- lib/kernels/test/src/test_softmax_kernel.cc | 9 +- lib/kernels/test/src/test_split_kernel.cc | 12 +- lib/kernels/test/src/test_transpose_kernel.cc | 8 +- lib/kernels/test/src/test_utils.cc | 105 ------------------ lib/kernels/test/src/test_utils.h | 88 ++++++++++++--- 23 files changed, 266 insertions(+), 230 deletions(-) create mode 100644 lib/kernels/include/kernels/local_cpu_allocator.h create mode 100644 lib/kernels/src/local_cpu_allocator.cc delete mode 100644 lib/kernels/test/src/test_utils.cc diff --git a/lib/kernels/include/kernels/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h new file mode 100644 index 0000000000..27dcc9d854 --- /dev/null +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -0,0 +1,22 @@ +#include "kernels/allocation.h" +#include <unordered_set> + +namespace FlexFlow { + +struct LocalCPUAllocator : public IAllocator { + LocalCPUAllocator() = default; + LocalCPUAllocator(LocalCPUAllocator const &) = delete; + LocalCPUAllocator(LocalCPUAllocator &&) = delete; + ~LocalCPUAllocator() override; + + void *allocate(size_t) override; + void deallocate(void *) override; + +private: + std::unordered_set<void *> ptrs; +}; +CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCPUAllocator); + +Allocator create_local_cpu_memory_allocator(); + +} // namespace FlexFlow diff --git a/lib/kernels/src/local_cpu_allocator.cc b/lib/kernels/src/local_cpu_allocator.cc new file mode 100644 index 0000000000..6553dc2f88 --- /dev/null +++ b/lib/kernels/src/local_cpu_allocator.cc @@ -0,0 +1,35 @@ +#include "kernels/local_cpu_allocator.h" +#include "kernels/device.h" + +namespace FlexFlow { +void *LocalCPUAllocator::allocate(size_t requested_memory_size) { + void *ptr = malloc(requested_memory_size); + if (ptr != nullptr) { + this->ptrs.insert(ptr); + } else { + throw std::bad_alloc(); + } + return ptr; +} + +void LocalCPUAllocator::deallocate(void *ptr) { + if (contains(this->ptrs, ptr)) { + free(ptr); + this->ptrs.erase(ptr); + } else { + throw std::runtime_error( + "Deallocating a pointer that was not allocated by this Allocator"); + } +} + +LocalCPUAllocator::~LocalCPUAllocator() { + for (auto ptr : ptrs) { + free(ptr); + } +} + +Allocator create_local_cpu_memory_allocator() { + return Allocator::create<LocalCPUAllocator>(); +} + +} // namespace FlexFlow diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index d44129ece1..1f06e3ffd7 100644 ---
a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -33,25 +33,28 @@ TEST_SUITE(FF_TEST_SUITE) { kvSeqLength, false); - TensorShape query_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, qSize}); - TensorShape key_shape = make_float_tensor_shape_from_legion_dims( + TensorShape query_shape = + make_tensor_shape_from_legion_dims( + {qoSeqLength, num_samples, qSize}); + TensorShape key_shape = make_tensor_shape_from_legion_dims( {kvSeqLength, num_samples, kSize}); - TensorShape value_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, vSize}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, oProjSize}); + TensorShape value_shape = + make_tensor_shape_from_legion_dims( + {kvSeqLength, num_samples, vSize}); + TensorShape output_shape = + make_tensor_shape_from_legion_dims( + {qoSeqLength, num_samples, oProjSize}); TensorShape weight_shape = - make_float_tensor_shape_from_legion_dims({state.weightSize}); + make_tensor_shape_from_legion_dims({state.weightSize}); GenericTensorAccessorW query_accessor = - create_random_filled_accessor_w(query_shape, allocator); + create_random_filled_accessor_w(query_shape, allocator); GenericTensorAccessorW key_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w(key_shape, allocator); GenericTensorAccessorW value_accessor = - create_random_filled_accessor_w(value_shape, allocator); + create_random_filled_accessor_w(value_shape, allocator); GenericTensorAccessorW weight_accessor = - create_random_filled_accessor_w(weight_shape, allocator); + create_random_filled_accessor_w(weight_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -73,15 +76,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW query_grad_accessor = - create_random_filled_accessor_w(query_shape, allocator); + create_random_filled_accessor_w(query_shape, allocator); GenericTensorAccessorW key_grad_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w(key_shape, allocator); GenericTensorAccessorW value_grad_accessor = - create_random_filled_accessor_w(value_shape, allocator); + create_random_filled_accessor_w(value_shape, allocator); GenericTensorAccessorW weight_grad_accessor = - create_random_filled_accessor_w(weight_shape, allocator); + create_random_filled_accessor_w(weight_shape, allocator); GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); Kernels::MultiHeadAttention::backward_kernel( managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index 18e6977148..a1a0eee27e 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -20,18 +20,18 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape_a = - make_float_tensor_shape_from_legion_dims({m, k, batch}); + make_tensor_shape_from_legion_dims({m, k, batch}); TensorShape input_shape_b = - make_float_tensor_shape_from_legion_dims({k, n, batch}); + make_tensor_shape_from_legion_dims({k, n, batch}); TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({m, n, batch}); + 
make_tensor_shape_from_legion_dims({m, n, batch}); GenericTensorAccessorW a_accessor = - create_random_filled_accessor_w(input_shape_a, allocator); + create_random_filled_accessor_w(input_shape_a, allocator); GenericTensorAccessorW b_accessor = - create_random_filled_accessor_w(input_shape_b, allocator); + create_random_filled_accessor_w(input_shape_b, allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { Kernels::BatchMatmul::forward_kernel(managed_stream.raw_stream(), @@ -50,7 +50,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW o_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW a_grad_accessor = allocator.allocate_tensor(input_shape_a); GenericTensorAccessorW b_grad_accessor = diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 8487bbda6a..4cddf19c14 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -23,25 +23,29 @@ TEST_SUITE(FF_TEST_SUITE) { output_w, true); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape scale_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape bias_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}); + TensorShape output_shape = + make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}); + TensorShape scale_shape = + make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}); + TensorShape bias_shape = + make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW scale_accessor = - create_filled_accessor_w(scale_shape, allocator, 1.0f); + create_filled_accessor_w(scale_shape, allocator, 1.0f); SUBCASE("forward_kernel") { GenericTensorAccessorW bias_accessor = - create_filled_accessor_w(bias_shape, allocator, 0.0f); + create_filled_accessor_w(bias_shape, allocator, 0.0f); Kernels::BatchNorm::forward_kernel(managed_stream.raw_stream(), state, @@ -58,13 +62,13 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW scale_grad_accessor = - create_random_filled_accessor_w(scale_shape, allocator); + create_random_filled_accessor_w(scale_shape, allocator); GenericTensorAccessorW bias_grad_accessor = - 
create_random_filled_accessor_w(bias_shape, allocator); + create_random_filled_accessor_w(bias_shape, allocator); Kernels::BatchNorm::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 004bc9c32f..60f5a9d2d6 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -12,17 +12,17 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}); TensorShape output_shape = - make_double_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_w(input_shape, allocator)); Kernels::Cast::forward_kernel(managed_stream.raw_stream(), input_accessor, diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 2e1000cb95..7df6a09e0e 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -11,13 +11,13 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_w(input_shape, allocator)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -33,7 +33,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_w(output_shape, allocator)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index bf2a521b4e..46c1b894be 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -13,9 +13,10 @@ TEST_SUITE(FF_TEST_SUITE) { ManagedFFStream managed_stream{}; TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({size_per_input}); + make_tensor_shape_from_legion_dims({size_per_input}); TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({size_per_input, num_inputs}); + make_tensor_shape_from_legion_dims( + {size_per_input, num_inputs}); Allocator allocator = create_local_cuda_memory_allocator(); @@ -23,7 +24,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector input_accessors = repeat(num_inputs, [&]() { return read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_w(input_shape, allocator)); }); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -43,7 +44,7 @@ TEST_SUITE(FF_TEST_SUITE) { 
SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_w(output_shape, allocator)); std::vector input_grad_accessors = repeat( num_inputs, [&]() { return allocator.allocate_tensor(input_shape); }); diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 981bc611d8..9e4d759eb9 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -14,7 +14,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10}); + make_tensor_shape_from_legion_dims({10, 10}); TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; @@ -32,7 +32,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_w(input_shape, allocator)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -50,9 +50,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_data = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_data = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Dropout::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 70894858e3..71c1bca3d2 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -10,12 +10,13 @@ TEST_SUITE(FF_TEST_SUITE) { ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}); TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 2.0f)); + create_filled_accessor_w(input_shape, allocator, 2.0f)); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -36,9 +37,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 0.0f); + create_filled_accessor_w(output_shape, allocator, 0.0f); GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); + create_filled_accessor_w(input_shape, allocator, 1.0f); Kernels::Flat::backward_kernel(managed_stream.raw_stream(), input_accessor, diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 88ac2f6889..b6c611e231 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -12,17 +12,19 @@ TEST_SUITE(FF_TEST_SUITE) { GatherPerDeviceState state = {managed_handle.raw_handle(), legion_dim_t(2)}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}); + TensorShape output_shape = + 
make_tensor_shape_from_legion_dims({50}); GenericTensorAccessorR index_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_w(output_shape, allocator)); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_w(input_shape, allocator)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -41,9 +43,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_w(output_shape, allocator)); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Gather::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 03b2f56bb9..b0cd1ffa78 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -12,10 +12,11 @@ TEST_SUITE(FF_TEST_SUITE) { bool elementwise_affine = true; TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({batch_size, feature_size}); + make_tensor_shape_from_legion_dims( + {batch_size, feature_size}); TensorShape output_shape = input_shape; TensorShape feature_shape = - make_float_tensor_shape_from_legion_dims({feature_size}); + make_tensor_shape_from_legion_dims({feature_size}); ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -32,15 +33,15 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_w(input_shape, allocator)); GenericTensorAccessorW gamma_accessor = - create_filled_accessor_w(feature_shape, allocator, 1.0f); + create_filled_accessor_w(feature_shape, allocator, 1.0f); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); GenericTensorAccessorW beta_accessor = - create_filled_accessor_w(feature_shape, allocator, 0.0f); + create_filled_accessor_w(feature_shape, allocator, 0.0f); Kernels::LayerNorm::forward_kernel(managed_stream.raw_stream(), state, @@ -53,9 +54,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_w(output_shape, allocator)); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW gamma_grad_accessor = allocator.allocate_tensor(feature_shape); GenericTensorAccessorW beta_grad_accessor = diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 437b37e954..dd525eb3ee 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -15,13 +15,13 @@ TEST_SUITE(FF_TEST_SUITE) { managed_handle.raw_handle(), DataType::FLOAT); TensorShape input_shape = - 
make_float_tensor_shape_from_legion_dims({10, 10}); + make_tensor_shape_from_legion_dims({10, 10}); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_filled_accessor_w(input_shape, allocator, 1.0f)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -40,9 +40,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_filled_accessor_w(output_shape, allocator, 1.0f)); GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + create_filled_accessor_w(input_shape, allocator, 2.0f); Kernels::Repartition::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index ebb92d39db..b0756b82a3 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -36,15 +36,17 @@ TEST_SUITE(FF_TEST_SUITE) { stride_w, pool_type); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {input_w, input_h, input_c, input_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_w, output_h, output_c, output_n}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims( + {input_w, input_h, input_c, input_n}); + TensorShape output_shape = + make_tensor_shape_from_legion_dims( + {output_w, output_h, output_c, output_n}); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { Kernels::Pool2D::forward_kernel(managed_stream.raw_stream(), @@ -60,7 +62,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + create_filled_accessor_w(output_shape, allocator, 1.0f); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 1ea740f336..d2483d28ed 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -8,7 +8,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::size_t num_replicas = 5; TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10, 10, 10, 10}); + make_tensor_shape_from_legion_dims( + {10, 10, 10, 10, 10}); ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -16,11 +17,12 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({10}); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({10}); GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_w(input_shape, allocator)); GenericTensorAccessorW output_accessor = 
allocator.allocate_tensor(output_shape); @@ -40,7 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_filled_accessor_w(output_shape, allocator, 1.0f)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 86d790f03c..6a4c03c5c5 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -7,7 +7,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Replicate Kernel") { std::size_t num_replicas = 10; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}); TensorShape output_shape = input_shape; ManagedPerDeviceFFHandle managed_handle{}; @@ -18,7 +19,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_filled_accessor_w(input_shape, allocator, 1.0f)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -36,10 +37,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); + create_filled_accessor_w(input_shape, allocator, 1.0f); GenericTensorAccessorR output_grad_accessor = read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_filled_accessor_w(output_shape, allocator, 1.0f)); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), input_grad_accessor, diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index f56bfacc2b..0d36b9796c 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -10,7 +10,8 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}); TensorShape output_shape = input_shape; ReshapePerDeviceState state = @@ -19,7 +20,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_filled_accessor_w(input_shape, allocator, 1.0f)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -38,9 +39,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_filled_accessor_w(output_shape, allocator, 1.0f)); GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + create_filled_accessor_w(input_shape, allocator, 2.0f); Kernels::Reshape::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index cdaf65a305..0bbcec75a7 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -9,7 +9,8 @@ 
TEST_SUITE(FF_TEST_SUITE) { std::size_t in_blk_size = 10; std::size_t num_out_blks = 1; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}); TensorShape output_shape = input_shape; ManagedPerDeviceFFHandle managed_handle{}; @@ -20,7 +21,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_filled_accessor_w(input_shape, allocator, 1.0f)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -40,9 +41,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Reverse::backward_kernel( managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index f49c1ebbcc..d541395066 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -13,18 +13,19 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}); TensorShape output_shape = input_shape; SoftmaxPerDeviceState state = Kernels::Softmax::init_kernel( managed_handle.raw_handle(), 0, input_n, channels, input_h, input_w); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Softmax::forward_kernel(managed_stream.raw_stream(), state, @@ -39,7 +40,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + create_filled_accessor_w(output_shape, allocator, 1.0f); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index 7cc2b28c9e..c4685921a9 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -16,12 +16,14 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50}); + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100}); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({50}); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, allocator); std::vector output_ptrs = repeat(num_outputs, [&]() { GenericTensorAccessorW output_accessor = @@ -42,12 +44,12 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector 
output_grad_ptrs(num_outputs); for (int i = 0; i < num_outputs; i++) { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 0.0f); + create_filled_accessor_w(input_shape, allocator, 0.0f); Kernels::Split::backward_kernel(managed_stream.raw_stream(), input_grad_accessor.get_float_ptr(), diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 2fc186a257..c29e3ceb2c 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -18,13 +18,13 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Transpose::init_kernel(num_dims, perm); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10}); + make_tensor_shape_from_legion_dims({10, 10}); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_w(input_shape, allocator)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -40,9 +40,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_w(output_shape, allocator)); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc deleted file mode 100644 index b591642570..0000000000 --- a/lib/kernels/test/src/test_utils.cc +++ /dev/null @@ -1,105 +0,0 @@ -#include "test_utils.h" - -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements(); - std::vector host_data(volume); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dist(-1.0f, 1.0f); - - for (auto &val : host_data) { - val = dist(gen); - } - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - float val, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements(); - std::vector host_data(volume, val); - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = 
allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements(); - std::vector host_data(volume); - - for (size_t i = 0; i < volume; i++) { - host_data[i] = i; - } - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -void fill_tensor_accessor_w(GenericTensorAccessorW accessor, - float val, - bool cpu_fill) { - LegionTensorDims dims = accessor.shape.dims; - size_t volume = accessor.shape.num_elements(); - std::vector host_data(volume, val); - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } -} - -TensorShape make_float_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DataType::FLOAT, - }; -} - -TensorShape make_double_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DataType::DOUBLE, - }; -} diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index abce3fd444..d78f3b377a 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -2,40 +2,102 @@ #define _FLEXFLOW_KERNELS_TEST_UTILS #include "kernels/device.h" +#include "kernels/local_cpu_allocator.h" #include "kernels/local_cuda_allocator.h" #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" #include +template GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, Allocator &allocator, - bool cpu_fill = false); + bool cpu_fill = false) { + GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); + size_t volume = accessor.shape.num_elements(); + std::vector
host_data(volume); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> dist(-1.0f, 1.0f); + for (auto &val : host_data) { + val = dist(gen); + } + + if (cpu_fill) { + memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(DT)); + } else { + checkCUDA(cudaMemcpy(accessor.ptr, + host_data.data(), + host_data.size() * sizeof(DT), + cudaMemcpyHostToDevice)); + } + + return accessor; +} + +template <typename DT> GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, Allocator &allocator, - float val, - bool cpu_fill = false); + DT val, + bool cpu_fill = false) { + GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); + size_t volume = accessor.shape.num_elements(); + std::vector<DT> host_data(volume, val); + + if (cpu_fill) { + memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(DT)); + } else { + checkCUDA(cudaMemcpy(accessor.ptr, + host_data.data(), + host_data.size() * sizeof(DT), + cudaMemcpyHostToDevice)); + } + return accessor; +} + +template <typename DT> GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, Allocator &allocator, - bool cpu_fill = false); + bool cpu_fill = false) { + GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); + size_t volume = accessor.shape.num_elements(); + std::vector<DT> host_data(volume); -void fill_tensor_accessor_w(GenericTensorAccessorW accessor, - float val, - bool cpu_fill = false); + for (size_t i = 0; i < volume; i++) { + host_data[i] = i; + } -TensorShape make_float_tensor_shape_from_legion_dims(FFOrdered<size_t> dims); + if (cpu_fill) { + memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(DT)); + } else { + checkCUDA(cudaMemcpy(accessor.ptr, + host_data.data(), + host_data.size() * sizeof(DT), + cudaMemcpyHostToDevice)); + } -TensorShape make_double_tensor_shape_from_legion_dims(FFOrdered<size_t> dims); + return accessor; +} -template <typename T> -std::vector<T> load_data_to_host_from_device(GenericTensorAccessorR accessor) { +template <DataType DT> +TensorShape make_tensor_shape_from_legion_dims(FFOrdered<size_t> dims) { + return TensorShape{ + TensorDims{ + dims, + }, + DT, + }; +}
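For reference, a minimal usage sketch of the templated helpers above (illustrative only, not part of the patch; the <DataType DT>/<typename DT> template parameters and exact call forms are assumptions reconstructed from how the tests below use them):

  // Illustrative sketch, not patch content: exercise the new templated helpers.
  Allocator allocator = create_local_cuda_memory_allocator();
  // Describe a 10x10 float tensor in legion dimension order.
  TensorShape shape = make_tensor_shape_from_legion_dims<DataType::FLOAT>({10, 10});
  // Fill it with uniform random values on the device ...
  GenericTensorAccessorW accessor = create_random_filled_accessor_w<float>(shape, allocator);
  // ... then copy it back to the host to inspect the values.
  std::vector<float> host_data = load_data_to_host_from_device<float>(
      read_only_accessor_from_write_accessor(accessor));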
+ +template <typename DT> +std::vector<DT> load_data_to_host_from_device(GenericTensorAccessorR accessor) { int volume = accessor.shape.get_volume(); - std::vector<T> local_data(volume); + std::vector<DT>
local_data(volume); checkCUDA(cudaMemcpy(local_data.data(), accessor.ptr, - local_data.size() * sizeof(T), + local_data.size() * sizeof(DT), cudaMemcpyDeviceToHost)); return local_data; } From 3bb8ff61290c0868fc708bd1bf0331b975f3e05e Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Fri, 12 Jul 2024 12:54:48 -0700 Subject: [PATCH 02/20] test utils modification, cast, reverse, and replicate cpu kernels --- .../include/kernels/cast_kernels_cpu.h | 27 ++++ .../include/kernels/replicate_kernels_cpu.h | 24 +++ .../include/kernels/reverse_kernels_cpu.h | 29 ++++ lib/kernels/src/cpu/cast_kernels.cc | 59 ++++++++ lib/kernels/src/cpu/combine_kernels.cc | 0 lib/kernels/src/cpu/replicate_kernels.cc | 61 ++++++++ lib/kernels/src/cpu/reverse_kernels.cc | 49 +++++++ lib/kernels/src/cuda/ops/reverse_kernels.cu | 36 ++++- lib/kernels/test/src/test_attention_kernel.cc | 20 +-- .../test/src/test_batch_matmul_kernel.cc | 8 +- .../test/src/test_batch_norm_kernel.cc | 23 ++- lib/kernels/test/src/test_cast_kernel.cc | 64 +++++++- lib/kernels/test/src/test_combine_kernel.cc | 11 +- lib/kernels/test/src/test_concat_kernel.cc | 9 +- lib/kernels/test/src/test_dropout.cc | 8 +- lib/kernels/test/src/test_flat_kernel.cc | 4 +- lib/kernels/test/src/test_gather_kernels.cc | 15 +- .../test/src/test_layer_norm_kernels.cc | 6 +- lib/kernels/test/src/test_partition_kernel.cc | 4 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 11 +- lib/kernels/test/src/test_reduction_kernel.cc | 9 +- lib/kernels/test/src/test_replicate_kernel.cc | 94 +++++++++++- lib/kernels/test/src/test_reshape_kernel.cc | 4 +- lib/kernels/test/src/test_reverse_kernels.cc | 119 ++++++++++++++- lib/kernels/test/src/test_softmax_kernel.cc | 11 +- lib/kernels/test/src/test_split_kernel.cc | 4 +- lib/kernels/test/src/test_transpose_kernel.cc | 13 +- lib/kernels/test/src/test_utils.cc | 24 +++ lib/kernels/test/src/test_utils.h | 138 +++++++++++------- 29 files changed, 728 insertions(+), 156 deletions(-) create mode 100644 lib/kernels/include/kernels/cast_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/replicate_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/reverse_kernels_cpu.h create mode 100644 lib/kernels/src/cpu/cast_kernels.cc create mode 100644 lib/kernels/src/cpu/combine_kernels.cc create mode 100644 lib/kernels/src/cpu/replicate_kernels.cc create mode 100644 lib/kernels/src/cpu/reverse_kernels.cc create mode 100644 lib/kernels/test/src/test_utils.cc diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h new file mode 100644 index 0000000000..df4ef22b93 --- /dev/null +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -0,0 +1,27 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H + +#include "device.h" +#include "kernels/accessor.h" + +namespace FlexFlow { +namespace Kernels { +namespace Cast { +namespace CPU { + +void forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type); + +void backward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type); + +} // namespace CPU +} // namespace Cast +} // namespace Kernels +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h new file mode 100644 index 0000000000..4bc97f00ef --- /dev/null +++ 
b/lib/kernels/include/kernels/replicate_kernels_cpu.h @@ -0,0 +1,24 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H + +#include "device.h" +#include "kernels/accessor.h" + +namespace FlexFlow { +namespace Kernels { +namespace Replicate { +namespace CPU { + +void forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void backward_kernel(GenericTensorAccessorW const &input, + GenericTensorAccessorR const &output, + size_t num_replicas); + +} // namespace CPU +} // namespace Replicate +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h new file mode 100644 index 0000000000..89ed6ffdb4 --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -0,0 +1,29 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H + +#include "device.h" + +namespace FlexFlow { +namespace Kernels { +namespace Reverse { +namespace CPU { + +void forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t output_size); + +void backward_kernel(float const *out_grad_ptr, + float *in_grad_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t input_size); +} // namespace CPU +} // namespace Reverse +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc new file mode 100644 index 0000000000..cf73a84b93 --- /dev/null +++ b/lib/kernels/src/cpu/cast_kernels.cc @@ -0,0 +1,59 @@ +#include "kernels/cast_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow { +namespace Kernels { +namespace Cast { +namespace CPU { + +template +void cast_forward(IDT const *input, ODT *output, size_t volume) { + for (size_t i = 0; i < volume; ++i) { + output[i] = static_cast(input[i]); + } +} + +template +void cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) { + for (size_t i = 0; i < volume; i++) { + output[i] = static_cast(input[i]) + beta * output[i]; + } +} + +template +struct ForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + size_t volume = input.shape.get_volume(); + cast_forward(input.get(), output.get(), volume); + } +}; + +template +struct BackwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + size_t volume = input.shape.get_volume(); + cast_backward( + input.get(), output.get(), volume, cast_to(1.0f)); + } +}; + +void forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type) { + DataTypeDispatch2{}(input_type, output_type, input, output); +} + +void backward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type) { + DataTypeDispatch2{}(input_type, output_type, input, output); +} + +} // namespace CPU +} // namespace Cast +} // namespace Kernels +} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/combine_kernels.cc b/lib/kernels/src/cpu/combine_kernels.cc new file mode 100644 index 0000000000..e69de29bb2 diff --git 
a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc new file mode 100644 index 0000000000..5f63d29691 --- /dev/null +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -0,0 +1,61 @@ +#include "kernels/datatype_dispatch.h" +#include "kernels/replicate_kernels_cpu.h" + +namespace FlexFlow { +namespace Kernels { +namespace Replicate { +namespace CPU { + +template +void replicate_backward_kernel(T *input, + T const *output, + size_t num_elements, + size_t num_replicas) { + for (size_t i = 0; i < num_elements; ++i) { + T sum = 0; + for (size_t j = 0; j < num_replicas; ++j) { + sum += output[j * num_elements + i]; + } + input[i] = sum; + } +} + +// Why does replicate forward seem to only transfer memory? Shouldn't it also +// handle the replication? +template +struct ForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + memcpy(output.get(), + input.get(), + input.shape.num_elements() * size_of_datatype(T)); + } +}; + +template +struct BackwardKernel { + void operator()(GenericTensorAccessorW const &input, + GenericTensorAccessorR const &output, + size_t num_replicas) { + size_t total_elements = input.shape.num_elements() * num_replicas; + replicate_backward_kernel( + input.get(), output.get(), total_elements, num_replicas); + } +}; + +void forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1{}(input.data_type, input, output); +} + +void backward_kernel(GenericTensorAccessorW const &input, + GenericTensorAccessorR const &output, + size_t num_replicas) { + DataTypeDispatch1{}( + input.data_type, input, output, num_replicas); +} + +} // namespace CPU +} // namespace Replicate +} // namespace Kernels +} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc new file mode 100644 index 0000000000..ac8ae26ca2 --- /dev/null +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -0,0 +1,49 @@ +#include "kernels/reverse_kernels_cpu.h" +#include + +namespace FlexFlow { +namespace Kernels { +namespace Reverse { +namespace CPU { + +void reverse_forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size) { + coord_t total_elements = num_out_blks * reverse_dim_size * in_blk_size; + for (coord_t i = 0; i < total_elements; ++i) { + coord_t blk_idx = i / (reverse_dim_size * in_blk_size); + coord_t offset = i - blk_idx * (reverse_dim_size * in_blk_size); + coord_t reverse_dim_idx = offset / in_blk_size; + coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + + (offset % in_blk_size); + out_ptr[i] = in_ptr[in_idx]; + } +} + +void forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t output_size) { + reverse_forward_kernel( + in_ptr, out_ptr, num_out_blks, reverse_dim_size, in_blk_size); +} + +void backward_kernel(float const *out_grad_ptr, + float *in_grad_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t input_size) { + reverse_forward_kernel( + out_grad_ptr, in_grad_ptr, num_out_blks, reverse_dim_size, in_blk_size); +} + +} // namespace CPU +} // namespace Reverse +} // namespace Kernels +} // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 8391a499df..f73c57dedf 100644 --- 
a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -21,6 +21,29 @@ namespace FlexFlow { namespace Kernels { namespace Reverse { +// __global__ void reverse_forward_kernel(float const *in_ptr, +// float *out_ptr, +// coord_t num_out_blks, +// coord_t reverse_dim_size, +// coord_t in_blk_size) { +// CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { +// coord_t blk_idx = i / (reverse_dim_size * in_blk_size); +// i = i - blk_idx * (reverse_dim_size * in_blk_size); +// coord_t reverse_dim_idx = i / in_blk_size; +// i = i - reverse_dim_idx * in_blk_size; +// coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + +// (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + +// i; +// out_ptr[i] = in_ptr[in_idx]; +// } +// } + +/* I mentioned this earlier, but I still think the reverse_forward_kernel code + is incorrect, even though it matches the code in inference/master? Whenever + I'm testing the code and printing out the output, I'm getting unexpected + outputs, and I think it's a result of modifying the loop index i in the + previous code? +*/ __global__ void reverse_forward_kernel(float const *in_ptr, float *out_ptr, coord_t num_out_blks, @@ -28,12 +51,13 @@ __global__ void reverse_forward_kernel(float const *in_ptr, coord_t in_blk_size) { CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { coord_t blk_idx = i / (reverse_dim_size * in_blk_size); - i = i - blk_idx * (reverse_dim_size * in_blk_size); - coord_t reverse_dim_idx = i / in_blk_size; - i = i - reverse_dim_idx * in_blk_size; - coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + - (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i; - out_ptr[i] = in_ptr[in_idx]; + coord_t idx_within_blk = i % (reverse_dim_size * in_blk_size); + coord_t reverse_dim_idx = idx_within_blk / in_blk_size; + coord_t in_idx = idx_within_blk % in_blk_size; + coord_t input_index = + blk_idx * (reverse_dim_size * in_blk_size) + + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + in_idx; + out_ptr[i] = in_ptr[input_index]; } } diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 1f06e3ffd7..c37b83fa24 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -48,13 +48,13 @@ TEST_SUITE(FF_TEST_SUITE) { make_tensor_shape_from_legion_dims({state.weightSize}); GenericTensorAccessorW query_accessor = - create_random_filled_accessor_w(query_shape, allocator); + create_random_filled_accessor_w(query_shape, allocator); GenericTensorAccessorW key_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w(key_shape, allocator); GenericTensorAccessorW value_accessor = - create_random_filled_accessor_w(value_shape, allocator); + create_random_filled_accessor_w(value_shape, allocator); GenericTensorAccessorW weight_accessor = - create_random_filled_accessor_w(weight_shape, allocator); + create_random_filled_accessor_w(weight_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -69,22 +69,22 @@ TEST_SUITE(FF_TEST_SUITE) { weight_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output = load_data_to_host_from_device( + std::vector host_output = load_accessor_data( read_only_accessor_from_write_accessor(output_accessor)); CHECK(contains_non_zero(host_output)); } SUBCASE("backward_kernel") { GenericTensorAccessorW query_grad_accessor = - 
create_random_filled_accessor_w(query_shape, allocator); + create_random_filled_accessor_w(query_shape, allocator); GenericTensorAccessorW key_grad_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w(key_shape, allocator); GenericTensorAccessorW value_grad_accessor = - create_random_filled_accessor_w(value_shape, allocator); + create_random_filled_accessor_w(value_shape, allocator); GenericTensorAccessorW weight_grad_accessor = - create_random_filled_accessor_w(weight_shape, allocator); + create_random_filled_accessor_w(weight_shape, allocator); GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); Kernels::MultiHeadAttention::backward_kernel( managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index a1a0eee27e..51a50e6cf2 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -27,11 +27,11 @@ TEST_SUITE(FF_TEST_SUITE) { make_tensor_shape_from_legion_dims({m, n, batch}); GenericTensorAccessorW a_accessor = - create_random_filled_accessor_w(input_shape_a, allocator); + create_random_filled_accessor_w(input_shape_a, allocator); GenericTensorAccessorW b_accessor = - create_random_filled_accessor_w(input_shape_b, allocator); + create_random_filled_accessor_w(input_shape_b, allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { Kernels::BatchMatmul::forward_kernel(managed_stream.raw_stream(), @@ -50,7 +50,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW o_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW a_grad_accessor = allocator.allocate_tensor(input_shape_a); GenericTensorAccessorW b_grad_accessor = diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 4cddf19c14..0d4682996a 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -37,9 +37,9 @@ TEST_SUITE(FF_TEST_SUITE) { {output_n, output_c, output_h, output_w}); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW scale_accessor = create_filled_accessor_w(scale_shape, allocator, 1.0f); @@ -54,21 +54,20 @@ TEST_SUITE(FF_TEST_SUITE) { scale_accessor.get_float_ptr(), bias_accessor.get_float_ptr()); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor)); CHECK(contains_non_zero(host_output_data)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - 
create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW scale_grad_accessor = - create_random_filled_accessor_w(scale_shape, allocator); + create_random_filled_accessor_w(scale_shape, allocator); GenericTensorAccessorW bias_grad_accessor = - create_random_filled_accessor_w(bias_shape, allocator); + create_random_filled_accessor_w(bias_shape, allocator); Kernels::BatchNorm::backward_kernel(managed_stream.raw_stream(), state, @@ -82,13 +81,13 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.shape.num_elements()); std::vector host_input_grad_data = - load_data_to_host_from_device( + load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor)); std::vector host_scale_grad_data = - load_data_to_host_from_device( + load_accessor_data( read_only_accessor_from_write_accessor(scale_grad_accessor)); std::vector host_bias_grad_data = - load_data_to_host_from_device( + load_accessor_data( read_only_accessor_from_write_accessor(bias_grad_accessor)); CHECK(contains_non_zero(host_input_grad_data)); diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 60f5a9d2d6..b77aa14406 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -17,12 +17,12 @@ TEST_SUITE(FF_TEST_SUITE) { make_tensor_shape_from_legion_dims({100, 100}); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_w(input_shape, allocator)); Kernels::Cast::forward_kernel(managed_stream.raw_stream(), input_accessor, @@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { DataType::DOUBLE); std::vector host_double_data = - load_data_to_host_from_device( + load_accessor_data( read_only_accessor_from_write_accessor(output_accessor)); CHECK(contains_non_zero(host_double_data)); @@ -49,9 +49,65 @@ TEST_SUITE(FF_TEST_SUITE) { DataType::FLOAT); std::vector host_grad_float_data = - load_data_to_host_from_device( + load_accessor_data( read_only_accessor_from_write_accessor(grad_input_accessor)); CHECK(contains_non_zero(host_grad_float_data)); } } + + TEST_CASE("Check Cast Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = + make_tensor_shape_from_legion_dims({100, 100}); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({100, 100}); + + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(output_shape); + + // Only calling forward kernel as backward kernel is exactly the same + SUBCASE("forward_kernel") { + auto transform = [start_val = 1.1f, + counter = 0.0f](float input) mutable -> float { + return start_val + counter++; + }; + + // Run GPU Forward Kernel + GenericTensorAccessorW input_accessor_gpu = + create_transformed_accessor_w( + input_shape, gpu_allocator, transform, false); + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), + read_only_accessor_from_write_accessor(input_accessor_gpu), + output_accessor_gpu, + DataType::FLOAT, + 
DataType::INT32); + std::vector result_data_gpu = + load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_gpu), + true); + + // Run CPU Forward Kernel + GenericTensorAccessorW input_accessor_cpu = + create_transformed_accessor_w( + input_shape, cpu_allocator, transform, true); + Kernels::Cast::CPU::forward_kernel( + read_only_accessor_from_write_accessor(input_accessor_cpu), + output_accessor_cpu, + DataType::FLOAT, + DataType::INT32); + std::vector result_data_cpu = + load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_cpu), + false); + + CHECK(result_data_gpu == result_data_cpu); + } + } } diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 7df6a09e0e..4c8e62a6e3 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -17,23 +17,22 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_w(input_shape, allocator)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Combine::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor)); CHECK(contains_non_zero(host_output_data)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_w(output_shape, allocator)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -41,7 +40,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor, input_grad_accessor); - std::vector host_input_grad = load_data_to_host_from_device( + std::vector host_input_grad = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor)); CHECK(contains_non_zero(host_input_grad)); } diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 46c1b894be..04bd4b5929 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -24,7 +24,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector input_accessors = repeat(num_inputs, [&]() { return read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_w(input_shape, allocator)); }); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -34,9 +34,8 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessors, concat_axis); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor)); CHECK(contains_non_zero(host_output_data)); } @@ -44,7 +43,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_w(output_shape, allocator)); std::vector input_grad_accessors = repeat( num_inputs, 
[&]() { return allocator.allocate_tensor(input_shape); }); diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 9e4d759eb9..c944a80b02 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -32,7 +32,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_w(input_shape, allocator)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -42,7 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor.get_float_ptr()); std::vector host_output_accessor = - load_data_to_host_from_device( + load_accessor_data( read_only_accessor_from_write_accessor(output_accessor)); CHECK(contains_non_zero(host_output_accessor)); @@ -50,9 +50,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_data = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_data = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Dropout::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 71c1bca3d2..3f8ef38f0b 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -27,7 +27,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor.get_float_ptr()); std::vector check_output_data = - load_data_to_host_from_device( + load_accessor_data( read_only_accessor_from_write_accessor(output_accessor)); std::vector expected_output_data( @@ -47,7 +47,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor.get_float_ptr()); std::vector backward_output_data = - load_data_to_host_from_device( + load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor)); std::vector expected_output_data( diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index b6c611e231..cfabef7ab2 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -19,12 +19,12 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR index_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_w(output_shape, allocator)); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_w(input_shape, allocator)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -34,18 +34,17 @@ TEST_SUITE(FF_TEST_SUITE) { index_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor)); CHECK(contains_non_zero(host_output_data)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_w(output_shape, allocator)); GenericTensorAccessorW 
input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Gather::backward_kernel(managed_stream.raw_stream(), state, @@ -54,7 +53,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor); std::vector host_input_grad_data = - load_data_to_host_from_device( + load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor)); CHECK(contains_non_zero(host_input_grad_data)); } diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index b0cd1ffa78..5bb589607b 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -33,7 +33,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_w(input_shape, allocator)); GenericTensorAccessorW gamma_accessor = create_filled_accessor_w(feature_shape, allocator, 1.0f); @@ -54,9 +54,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_w(output_shape, allocator)); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW gamma_grad_accessor = allocator.allocate_tensor(feature_shape); GenericTensorAccessorW beta_grad_accessor = diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index dd525eb3ee..1e009b205a 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -29,7 +29,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), state, input_accessor, output_accessor); std::vector check_output_data = - load_data_to_host_from_device( + load_accessor_data( read_only_accessor_from_write_accessor(output_accessor)); std::vector expected_output_data( @@ -50,7 +50,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor); std::vector host_grad_input_data = - load_data_to_host_from_device( + load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor)); std::vector expected_grad_input_data( diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index b0756b82a3..d6df1daa4a 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -44,9 +44,9 @@ TEST_SUITE(FF_TEST_SUITE) { {output_w, output_h, output_c, output_n}); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { Kernels::Pool2D::forward_kernel(managed_stream.raw_stream(), @@ -54,9 +54,8 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.ptr, output_accessor.ptr); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor)); 
CHECK(contains_non_zero(host_output_data)); } @@ -73,7 +72,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor.ptr, output_grad_accessor.ptr); - std::vector host_input_grad = load_data_to_host_from_device( + std::vector host_input_grad = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor)); CHECK(contains_non_zero(host_input_grad)); } diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index d2483d28ed..5dcf85e39d 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -22,7 +22,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_w(input_shape, allocator)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -31,9 +31,8 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor, num_replicas); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor)); CHECK(contains_non_zero(host_output_data)); } @@ -52,7 +51,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector expected_grad_input_data( input_grad_accessor.shape.num_elements(), 1.0f); - std::vector host_grad_data = load_data_to_host_from_device( + std::vector host_grad_data = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor)); CHECK(host_grad_data == expected_grad_input_data); } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 6a4c03c5c5..ab1c7c3228 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/replicate_kernels.h" +#include "kernels/replicate_kernels_cpu.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -9,7 +10,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_tensor_shape_from_legion_dims({100}); - TensorShape output_shape = input_shape; + TensorShape output_shape = + make_tensor_shape_from_legion_dims({100}); ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -27,7 +29,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), input_accessor, output_accessor); std::vector check_output_data = - load_data_to_host_from_device( + load_accessor_data( read_only_accessor_from_write_accessor(output_accessor)); std::vector expected_output_data( @@ -48,9 +50,95 @@ TEST_SUITE(FF_TEST_SUITE) { num_replicas); std::vector check_aggregated_data = - load_data_to_host_from_device( + load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor)); CHECK(contains_non_zero(check_aggregated_data)); } } + + TEST_CASE("Check Replicate Forward Kernel against CPU Kernel") { + std::size_t num_replicas = 10; + + // This should be like three shapes: pre_replication, replication shape, and + // reduced shape, but things are weird cause doesn't seem to be replicating + // anything + TensorShape input_shape = + make_tensor_shape_from_legion_dims({10, num_replicas}); + TensorShape replicated_shape = + make_tensor_shape_from_legion_dims({10, num_replicas}); + TensorShape reduced_shape = + make_tensor_shape_from_legion_dims({10}); + + ManagedPerDeviceFFHandle managed_handle{}; + ManagedFFStream 
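Regarding the shape comment above: since input_shape and replicated_shape are identical here, the forward comparison only exercises the copy path of Replicate; the per-replica behavior is covered by the backward subcase, where each input-gradient element accumulates its num_replicas copies from the output gradient. A reference loop for that reduction, mirroring the CPU replicate backward kernel these tests call (the function name is illustrative, not part of the patch):

// Element i of the input gradient sums replica j's copy stored at
// output[i + j * num_elements].
void replicate_backward_reference(float *input,
                                  float const *output,
                                  size_t num_replicas,
                                  size_t num_elements) {
  for (size_t i = 0; i < num_elements; ++i) {
    float sum = 0.0f;
    for (size_t j = 0; j < num_replicas; ++j) {
      sum += output[i + j * num_elements];
    }
    input[i] = sum;
  }
}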
managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + // Run GPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + read_only_accessor_from_write_accessor( + create_random_filled_accessor_w(input_shape, gpu_allocator)); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(replicated_shape); + + Kernels::Replicate::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + std::vector result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_gpu), true); + + // Run CPU Replicate Forward Kernel + GenericTensorAccessorW input_accessor_cpu = + copy_tensor_between_memories( + input_accessor_gpu, input_shape, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(replicated_shape); + + Kernels::Replicate::CPU::forward_kernel( + read_only_accessor_from_write_accessor(input_accessor_cpu), + output_accessor_cpu); + + std::vector result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_cpu), false); + + CHECK(result_data_gpu == result_data_cpu); + } + + SUBCASE("backward_kernel") { + GenericTensorAccessorR output_grad_accessor_gpu = + read_only_accessor_from_write_accessor( + create_random_filled_accessor_w(replicated_shape, gpu_allocator)); + GenericTensorAccessorW input_grad_accessor_gpu = + gpu_allocator.allocate_tensor(reduced_shape); + + Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), + input_grad_accessor_gpu, + output_grad_accessor_gpu, + num_replicas); + + std::vector result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_gpu), + true); + + GenericTensorAccessorW output_grad_accessor_cpu = + copy_tensor_between_memories( + output_grad_accessor_gpu, replicated_shape, cpu_allocator); + + GenericTensorAccessorW input_grad_accessor_cpu = + cpu_allocator.allocate_tensor(reduced_shape); + + Kernels::Replicate::CPU::backward_kernel( + input_grad_accessor_cpu, + read_only_accessor_from_write_accessor(output_grad_accessor_cpu), + num_replicas); + + std::vector result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_cpu), + false); + + CHECK(result_data_gpu == result_data_cpu); + } + } } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index 0d36b9796c..e1a8ccc4b7 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -28,7 +28,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), state, input_accessor, output_accessor); std::vector check_output_data = - load_data_to_host_from_device( + load_accessor_data( read_only_accessor_from_write_accessor(output_accessor)); std::vector expected_output_data( @@ -49,7 +49,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor); std::vector host_grad_input_data = - load_data_to_host_from_device( + load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor)); std::vector expected_grad_input_data( diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 0bbcec75a7..ff9a6c23a5 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include 
"kernels/reverse_kernels.h" +#include "kernels/reverse_kernels_cpu.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -10,7 +11,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::size_t num_out_blks = 1; TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}); + make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}); TensorShape output_shape = input_shape; ManagedPerDeviceFFHandle managed_handle{}; @@ -34,16 +36,17 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.shape.num_elements()); std::vector check_output_data = - load_data_to_host_from_device( + load_accessor_data( read_only_accessor_from_write_accessor(output_accessor)); + CHECK(contains_non_zero(check_output_data)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + allocator.allocate_tensor(input_shape); Kernels::Reverse::backward_kernel( managed_stream.raw_stream(), @@ -55,9 +58,115 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor.shape.num_elements()); std::vector host_grad_input_data = - load_data_to_host_from_device( + load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor)); + CHECK(contains_non_zero(host_grad_input_data)); } } + + TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { + std::size_t num_out_blks = 2; + std::size_t reverse_dim_size = 3; + std::size_t in_blk_size = 5; + + TensorShape input_shape = + make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}); + TensorShape output_shape = input_shape; + + ManagedPerDeviceFFHandle managed_handle{}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + auto transform = [counter = 0.0f](float val) mutable { + return counter++; + }; + + // Run GPU Cast Forward Kernel + GenericTensorAccessorW input_accessor_gpu = + create_transformed_accessor_w( + input_shape, gpu_allocator, transform, false); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + + Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), + input_accessor_gpu.get_float_ptr(), + output_accessor_gpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_accessor_gpu.shape.num_elements()); + + std::vector result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_gpu), true); + + // Run CPU Cast Forward Kernel + GenericTensorAccessorW input_accessor_cpu = + create_transformed_accessor_w( + input_shape, cpu_allocator, transform, true); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(output_shape); + + Kernels::Reverse::CPU::forward_kernel( + input_accessor_cpu.get_float_ptr(), + output_accessor_cpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_accessor_cpu.shape.num_elements()); + + std::vector result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_cpu), false); + + CHECK(result_data_gpu == result_data_cpu); + } + + SUBCASE("backward_kernel") { + // Run GPU Cast Backward Kernel + GenericTensorAccessorW output_grad_accessor_gpu = + create_random_filled_accessor_w(output_shape, gpu_allocator); + 
GenericTensorAccessorW input_grad_accessor_gpu = + gpu_allocator.allocate_tensor(input_shape); + + Kernels::Reverse::backward_kernel( + managed_stream.raw_stream(), + output_grad_accessor_gpu.get_float_ptr(), + input_grad_accessor_gpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_grad_accessor_gpu.shape.num_elements()); + + std::vector result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_gpu), + true); + + // Run CPU Cast Backward Kernel + GenericTensorAccessorW output_grad_accessor_cpu = + copy_tensor_between_memories( + read_only_accessor_from_write_accessor(output_grad_accessor_gpu), + output_shape, + cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + cpu_allocator.allocate_tensor(input_shape); + + Kernels::Reverse::CPU::backward_kernel( + output_grad_accessor_cpu.get_float_ptr(), + input_grad_accessor_cpu.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_grad_accessor_cpu.shape.num_elements()); + + std::vector result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_cpu), + false); + + CHECK(result_data_gpu == result_data_cpu); + } + } } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index d541395066..a9f7fa8bc0 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -21,20 +21,19 @@ TEST_SUITE(FF_TEST_SUITE) { managed_handle.raw_handle(), 0, input_n, channels, input_h, input_w); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Softmax::forward_kernel(managed_stream.raw_stream(), state, input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor)); CHECK(contains_non_zero(host_output_data)); } @@ -53,7 +52,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector expected_input_grad_data = std::vector(input_grad_accessor.shape.num_elements(), 1.0f); std::vector host_input_grad_data = - load_data_to_host_from_device( + load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor)); CHECK(host_input_grad_data == expected_input_grad_data); } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index c4685921a9..304a7ba121 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -23,7 +23,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, allocator); std::vector output_ptrs = repeat(num_outputs, [&]() { GenericTensorAccessorW output_accessor = @@ -44,7 +44,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector output_grad_ptrs(num_outputs); for (int i = 0; i < num_outputs; i++) { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, allocator); 
output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index c29e3ceb2c..a4cbf37c4b 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -24,25 +24,24 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_w(input_shape, allocator)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Transpose::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor)); CHECK(contains_non_zero(host_output_data)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_w(output_shape, allocator)); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), state, @@ -50,7 +49,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor); std::vector host_grad_input_data = - load_data_to_host_from_device( + load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor)); CHECK(contains_non_zero(host_grad_input_data)); } diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc new file mode 100644 index 0000000000..c9d2bf0a7c --- /dev/null +++ b/lib/kernels/test/src/test_utils.cc @@ -0,0 +1,24 @@ +#include "test_utils.h" + +GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, + Allocator &allocator, + bool on_host) { + GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); + size_t volume = accessor.shape.num_elements(); + std::vector host_data(volume); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + for (auto &val : host_data) { + val = dist(gen); + } + + transfer_memory(static_cast(accessor.ptr), + host_data.data(), + volume, + GpuDirection::HostToDevice, + on_host); + + return accessor; +} diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index d78f3b377a..9478cc2a57 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -8,78 +8,99 @@ #include "kernels/managed_per_device_ff_handle.h" #include -template -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill = false) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements(); - std::vector
host_data(volume); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution
dist(-1.0f, 1.0f); - - for (auto &val : host_data) { - val = dist(gen); - } +enum class GpuDirection { + HostToDevice = 0, + DeviceToHost = 1, + DeviceToDevice = 2 +}; - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(DT)); +template +void transfer_memory(DT *dst, + const DT *src, + size_t num_elements, + GpuDirection gpu_dir, + bool cpu_memory) { + size_t bytes = num_elements * sizeof(DT); + + if (cpu_memory) { + memcpy(dst, src, bytes); } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(DT), - cudaMemcpyHostToDevice)); + switch (gpu_dir) { + case GpuDirection::HostToDevice: + checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice)); + break; + case GpuDirection::DeviceToHost: + checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost)); + break; + case GpuDirection::DeviceToDevice: + checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice)); + break; + } } - - return accessor; } +GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, + Allocator &allocator, + bool on_host = false); + template GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, Allocator &allocator, DT val, - bool cpu_fill = false) { + bool on_host = false) { GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); size_t volume = accessor.shape.num_elements(); std::vector
host_data(volume, val); - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(DT)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(DT), - cudaMemcpyHostToDevice)); - } + transfer_memory(static_cast
(accessor.ptr), + host_data.data(), + volume, + GpuDirection::HostToDevice, + on_host); return accessor; } -template -GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, +template +GenericTensorAccessorW create_transformed_accessor_w(TensorShape const &shape, Allocator &allocator, - bool cpu_fill = false) { + F transform, + bool on_host = false) { GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements(); - std::vector
host_data(volume); + size_t volume = accessor.shape.get_volume(); + std::vector input_data(volume); + std::vector output_data(volume); - for (size_t i = 0; i < volume; i++) { - host_data[i] = i; - } + std::transform( + input_data.begin(), input_data.end(), output_data.begin(), transform); - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(DT)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(DT), - cudaMemcpyHostToDevice)); - } + transfer_memory(static_cast(accessor.ptr), + output_data.data(), + volume, + GpuDirection::HostToDevice, + on_host); return accessor; } +template +GenericTensorAccessorW + copy_tensor_between_memories(GenericTensorAccessorR accessor, + TensorShape const &shape, + Allocator &allocator, + bool src_on_host = false) { + GenericTensorAccessorW copied_accessor = allocator.allocate_tensor(shape); + + size_t volume = accessor.shape.get_volume(); + GpuDirection gpu_dir = + src_on_host ? GpuDirection::HostToDevice : GpuDirection::DeviceToHost; + + transfer_memory( + copied_accessor.get
<DT>(), accessor.get<DT>
(), volume, gpu_dir, false); + + return copied_accessor; +} + template TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims) { return TensorShape{ @@ -90,15 +111,24 @@ TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims) { }; } -template -std::vector
load_data_to_host_from_device(GenericTensorAccessorR accessor) { +template +std::vector> load_accessor_data(GenericTensorAccessorR accessor, + bool on_device = true) { int volume = accessor.shape.get_volume(); - std::vector
local_data(volume); - checkCUDA(cudaMemcpy(local_data.data(), - accessor.ptr, - local_data.size() * sizeof(DT), - cudaMemcpyDeviceToHost)); + using T = real_type
<DT>;
+  std::vector<T> local_data(volume);
+  T const *src_ptr = accessor.get<DT>
(); + + if (on_device) { + checkCUDA(cudaMemcpy(local_data.data(), + src_ptr, + volume * sizeof(T), + cudaMemcpyDeviceToHost)); + } else { + std::memcpy(local_data.data(), src_ptr, volume * sizeof(T)); + } + return local_data; } From 968cd6d3cbbe172e1600b2dcf789b2d71207c033 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Sun, 14 Jul 2024 15:45:59 -0700 Subject: [PATCH 03/20] combine kernel --- lib/kernels/include/kernels/cast_kernels.h | 2 - lib/kernels/src/cpu/combine_kernels.cc | 44 +++++++++++ lib/kernels/src/cpu/replicate_kernels.cc | 2 +- lib/kernels/src/cpu/reverse_kernels.cc | 1 - lib/kernels/src/local_cpu_allocator.cc | 4 +- lib/kernels/src/local_cuda_allocator.cc | 1 + lib/kernels/test/src/test_cast_kernel.cc | 4 +- lib/kernels/test/src/test_combine_kernel.cc | 79 ++++++++++++++++++- lib/kernels/test/src/test_replicate_kernel.cc | 15 ++-- lib/kernels/test/src/test_reverse_kernels.cc | 8 +- lib/kernels/test/src/test_utils.h | 12 +-- 11 files changed, 144 insertions(+), 28 deletions(-) diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index 96f9aadd52..502a823ca7 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -3,8 +3,6 @@ #include "device.h" #include "kernels/accessor.h" -#include "kernels/ff_handle.h" -#include "op-attrs/activation.dtg.h" namespace FlexFlow { namespace Kernels { diff --git a/lib/kernels/src/cpu/combine_kernels.cc b/lib/kernels/src/cpu/combine_kernels.cc index e69de29bb2..f1950a56d2 100644 --- a/lib/kernels/src/cpu/combine_kernels.cc +++ b/lib/kernels/src/cpu/combine_kernels.cc @@ -0,0 +1,44 @@ +#include "kernels/combine_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow { +namespace Kernels { +namespace Combine { +namespace CPU { + +template +struct ForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + memcpy(output.get
<DT>(),
+           input.get<DT>
(), + input.shape.get_volume() * size_of_datatype(DT)); + } +}; + +template +struct BackwardKernel { + void operator()(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + size_t num_elements = output_grad.shape.get_volume(); + for (int i = 0; i < num_elements; ++i) { + input_grad.get
<DT>()[i] += output_grad.get<DT>
()[i]; + } + } +}; + +void forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1{}(input.data_type, input, output); +} + +void backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + DataTypeDispatch1{}( + input_grad.data_type, output_grad, input_grad); +} + +} // namespace CPU +} // namespace Combine +} // namespace Kernels +} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index 5f63d29691..a26d2054d1 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -14,7 +14,7 @@ void replicate_backward_kernel(T *input, for (size_t i = 0; i < num_elements; ++i) { T sum = 0; for (size_t j = 0; j < num_replicas; ++j) { - sum += output[j * num_elements + i]; + sum += output[i + j * num_elements]; } input[i] = sum; } diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index ac8ae26ca2..b035f03721 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -1,5 +1,4 @@ #include "kernels/reverse_kernels_cpu.h" -#include namespace FlexFlow { namespace Kernels { diff --git a/lib/kernels/src/local_cpu_allocator.cc b/lib/kernels/src/local_cpu_allocator.cc index 6553dc2f88..9cc86c44ca 100644 --- a/lib/kernels/src/local_cpu_allocator.cc +++ b/lib/kernels/src/local_cpu_allocator.cc @@ -3,12 +3,14 @@ namespace FlexFlow { void *LocalCPUAllocator::allocate(size_t requested_memory_size) { - void *ptr = malloc(requested_memory_size); + void *ptr = calloc(1, requested_memory_size); + if (ptr != nullptr) { this->ptrs.insert(ptr); } else { throw std::bad_alloc(); } + return ptr; } diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/local_cuda_allocator.cc index 931e81c0b8..c82abc765d 100644 --- a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/local_cuda_allocator.cc @@ -5,6 +5,7 @@ namespace FlexFlow { void *LocalCudaAllocator::allocate(size_t requested_memory_size) { void *ptr; checkCUDA(cudaMalloc(&ptr, requested_memory_size)); + checkCUDA(cudaMemset(ptr, 0, requested_memory_size)); this->ptrs.insert(ptr); return ptr; } diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index b77aa14406..e7da356564 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -91,7 +91,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_gpu = load_accessor_data( read_only_accessor_from_write_accessor(output_accessor_gpu), - true); + false); // Run CPU Forward Kernel GenericTensorAccessorW input_accessor_cpu = @@ -105,7 +105,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_cpu = load_accessor_data( read_only_accessor_from_write_accessor(output_accessor_cpu), - false); + true); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 4c8e62a6e3..60c55ca062 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -1,10 +1,11 @@ #include "doctest/doctest.h" #include "kernels/combine_kernels.h" +#include "kernels/combine_kernels_cpu.h" #include "test_utils.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test combine kernel") { + TEST_CASE("Call Combine Forward and Backward Kernels") { ManagedPerDeviceFFHandle managed_handle{}; 
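Also worth noting from the allocator changes in this commit: LocalCPUAllocator now zero-initializes with calloc and LocalCudaAllocator adds a cudaMemset after cudaMalloc, so buffers returned by allocate_tensor start zero-filled on both backends; presumably this keeps the CPU-vs-GPU comparisons below deterministic when an output tensor is allocated but not fully overwritten.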
ManagedFFStream managed_stream{}; @@ -45,4 +46,80 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(contains_non_zero(host_input_grad)); } } + + TEST_CASE("Check Combine Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = + make_tensor_shape_from_legion_dims({5, 5}); + TensorShape output_shape = input_shape; + + SUBCASE("forward_kernel") { + // Run GPU Combine Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + read_only_accessor_from_write_accessor( + create_random_filled_accessor_w(input_shape, gpu_allocator)); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + std::vector result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_gpu), false); + + // Run CPU Combine Forward Kernel + GenericTensorAccessorW input_accessor_cpu = + copy_tensor_between_memories( + input_accessor_gpu, input_shape, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::CPU::forward_kernel( + read_only_accessor_from_write_accessor(input_accessor_cpu), + output_accessor_cpu); + + std::vector result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(output_accessor_cpu), true); + + CHECK(result_data_gpu == result_data_cpu); + } + + SUBCASE("backward_kernel") { + // Run GPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + read_only_accessor_from_write_accessor( + create_random_filled_accessor_w(output_shape, gpu_allocator)); + GenericTensorAccessorW input_grad_accessor_gpu = + gpu_allocator.allocate_tensor(input_shape); + + Kernels::Combine::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu); + + std::vector result_data_gpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_gpu), + false); + + // Run CPU Combine Backward Kernel + GenericTensorAccessorW output_grad_accessor_cpu = + copy_tensor_between_memories( + output_grad_accessor_gpu, output_shape, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + cpu_allocator.allocate_tensor(input_shape); + + Kernels::Combine::CPU::backward_kernel( + read_only_accessor_from_write_accessor(output_grad_accessor_cpu), + input_grad_accessor_cpu); + + std::vector result_data_cpu = load_accessor_data( + read_only_accessor_from_write_accessor(input_grad_accessor_cpu), + true); + + CHECK(result_data_gpu == result_data_cpu); + } + } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index ab1c7c3228..49807355e1 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -5,7 +5,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test Replicate Kernel") { + TEST_CASE("Call Replicate Forward and Backward Kernels") { std::size_t num_replicas = 10; TensorShape input_shape = @@ -61,7 +61,7 @@ TEST_SUITE(FF_TEST_SUITE) { // This should be like three shapes: pre_replication, replication shape, and // reduced shape, but things are weird cause doesn't seem to be replicating - // anything + // anything (ie. 
input shape should be same as reduced shape) TensorShape input_shape = make_tensor_shape_from_legion_dims({10, num_replicas}); TensorShape replicated_shape = @@ -87,7 +87,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), true); + read_only_accessor_from_write_accessor(output_accessor_gpu), false); // Run CPU Replicate Forward Kernel GenericTensorAccessorW input_accessor_cpu = @@ -101,12 +101,13 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor_cpu); std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), false); + read_only_accessor_from_write_accessor(output_accessor_cpu), true); CHECK(result_data_gpu == result_data_cpu); } SUBCASE("backward_kernel") { + // Run GPU Replicate Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = read_only_accessor_from_write_accessor( create_random_filled_accessor_w(replicated_shape, gpu_allocator)); @@ -120,12 +121,12 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_gpu = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - true); + false); + // Run CPU Replicate Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = copy_tensor_between_memories( output_grad_accessor_gpu, replicated_shape, cpu_allocator); - GenericTensorAccessorW input_grad_accessor_cpu = cpu_allocator.allocate_tensor(reduced_shape); @@ -136,7 +137,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_cpu = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - false); + true); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index ff9a6c23a5..fc7acc99cd 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -102,7 +102,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor_gpu.shape.num_elements()); std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), true); + read_only_accessor_from_write_accessor(output_accessor_gpu), false); // Run CPU Cast Forward Kernel GenericTensorAccessorW input_accessor_cpu = @@ -120,7 +120,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor_cpu.shape.num_elements()); std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), false); + read_only_accessor_from_write_accessor(output_accessor_cpu), true); CHECK(result_data_gpu == result_data_cpu); } @@ -143,7 +143,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_gpu = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - true); + false); // Run CPU Cast Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = @@ -164,7 +164,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector result_data_cpu = load_accessor_data( read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - false); + true); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index 9478cc2a57..1ce9e7a3d7 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -113,21 +113,15 @@ TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims) { template std::vector> load_accessor_data(GenericTensorAccessorR accessor, - bool 
on_device = true) { + bool on_host = false) { int volume = accessor.shape.get_volume(); using T = real_type
<DT>;
   std::vector<T> local_data(volume);
   T const *src_ptr = accessor.get<DT>
(); - if (on_device) { - checkCUDA(cudaMemcpy(local_data.data(), - src_ptr, - volume * sizeof(T), - cudaMemcpyDeviceToHost)); - } else { - std::memcpy(local_data.data(), src_ptr, volume * sizeof(T)); - } + transfer_memory( + local_data.data(), src_ptr, volume, GpuDirection::DeviceToHost, on_host); return local_data; } From 723515bbdb198b97ae04fec110388b9c7edc1036 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Sun, 14 Jul 2024 15:58:40 -0700 Subject: [PATCH 04/20] combine kernels .h file --- .../include/kernels/combine_kernels_cpu.h | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 lib/kernels/include/kernels/combine_kernels_cpu.h diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h new file mode 100644 index 0000000000..1d30297af1 --- /dev/null +++ b/lib/kernels/include/kernels/combine_kernels_cpu.h @@ -0,0 +1,23 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H + +#include "device.h" +#include "kernels/accessor.h" + +namespace FlexFlow { +namespace Kernels { +namespace Combine { +namespace CPU { + +void forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); + +} // namespace CPU +} // namespace Combine +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H From ba586aed2093e71ec430bc22cf8c13c8d1e3100d Mon Sep 17 00:00:00 2001 From: Marsella8 <45826022+Marsella8@users.noreply.github.com> Date: Thu, 18 Jul 2024 17:38:22 -0700 Subject: [PATCH 05/20] Implementations for methods for machine_views and associated modules (#1429) * initial commit for machine view adjacent modules * Formatting * Tests for new machine_view.cc functions * formatting * Minor Test correction * formatting * PR fixes * PR Fixes --------- Co-authored-by: Pietro Max Marsella --- lib/pcg/include/pcg/device_id.h | 2 +- lib/pcg/include/pcg/machine_view.h | 19 +++++- lib/pcg/include/pcg/strided_rectangle.h | 5 +- lib/pcg/src/pcg/machine_view.cc | 75 +++++++++++++++++++--- lib/pcg/src/pcg/strided_rectangle_side.cc | 6 +- lib/pcg/src/strided_rectangle.cc | 17 +++-- lib/pcg/test/src/test_machine_view.cc | 74 +++++++++++++++++++++ lib/pcg/test/src/test_strided_rectangle.cc | 37 +++++++++++ 8 files changed, 216 insertions(+), 19 deletions(-) create mode 100644 lib/pcg/test/src/test_machine_view.cc create mode 100644 lib/pcg/test/src/test_strided_rectangle.cc diff --git a/lib/pcg/include/pcg/device_id.h b/lib/pcg/include/pcg/device_id.h index be92be7081..1157a2932a 100644 --- a/lib/pcg/include/pcg/device_id.h +++ b/lib/pcg/include/pcg/device_id.h @@ -10,7 +10,7 @@ namespace FlexFlow { device_id_t operator+(device_id_t, size_t); -DeviceType get_device_type(device_id_t); +DeviceType get_device_type(device_id_t const &device_id); gpu_id_t unwrap_gpu(device_id_t); cpu_id_t unwrap_cpu(device_id_t); diff --git a/lib/pcg/include/pcg/machine_view.h b/lib/pcg/include/pcg/machine_view.h index 625b128d35..56abf5aa20 100644 --- a/lib/pcg/include/pcg/machine_view.h +++ b/lib/pcg/include/pcg/machine_view.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_PCG_INCLUDE_PCG_MACHINE_VIEW_H #include "pcg/cpu_id_t.dtg.h" +#include "pcg/device_id.h" #include "pcg/device_id_t.dtg.h" #include "pcg/device_type.dtg.h" #include "pcg/gpu_id_t.dtg.h" @@ -14,15 +15,31 @@ namespace FlexFlow { std::vector device_ids(MachineView const 
&); -std::size_t num_dims(MachineView const &); +size_t num_dims(MachineView const &); std::size_t num_devices(MachineView const &); DeviceType get_device_type(MachineView const &); MachineView make_1d_machine_view(gpu_id_t start, gpu_id_t stop, int stride = 1); MachineView make_1d_machine_view(cpu_id_t start, cpu_id_t stop, int stride = 1); +MachineView + make_1d_machine_view(device_id_t start, device_id_t stop, int stride = 1); + +MachineView make_1d_machine_view(gpu_id_t start, + num_points_t num_points, + int stride = 1); +MachineView make_1d_machine_view(cpu_id_t start, + num_points_t num_points, + int stride = 1); MachineView make_1d_machine_view(device_id_t start, num_points_t num_points, int stride = 1); + +MachineView make_1d_machine_view(gpu_id_t start, + side_size_t interval_size, + int stride = 1); +MachineView make_1d_machine_view(cpu_id_t start, + side_size_t interval_size, + int stride = 1); MachineView make_1d_machine_view(device_id_t start, side_size_t interval_size, int stride = 1); diff --git a/lib/pcg/include/pcg/strided_rectangle.h b/lib/pcg/include/pcg/strided_rectangle.h index 24ae51ac41..9c3b8eeda9 100644 --- a/lib/pcg/include/pcg/strided_rectangle.h +++ b/lib/pcg/include/pcg/strided_rectangle.h @@ -8,8 +8,9 @@ namespace FlexFlow { size_t get_num_dims(StridedRectangle const &); -StridedRectangleSide get_side_at_idx(StridedRectangle const &, - ff_dim_t const &); +StridedRectangleSide get_side_at_idx(StridedRectangle const &rect, + ff_dim_t const &idx); +num_points_t get_num_points(StridedRectangle const &rect); } // namespace FlexFlow diff --git a/lib/pcg/src/pcg/machine_view.cc b/lib/pcg/src/pcg/machine_view.cc index 00bf1296fe..c09ab1a3c9 100644 --- a/lib/pcg/src/pcg/machine_view.cc +++ b/lib/pcg/src/pcg/machine_view.cc @@ -1,5 +1,7 @@ #include "pcg/machine_view.h" +#include "pcg/device_id.h" #include "pcg/strided_rectangle.dtg.h" +#include "pcg/strided_rectangle.h" #include "pcg/strided_rectangle_side.h" namespace FlexFlow { @@ -8,16 +10,16 @@ std::vector device_ids(MachineView const &) { NOT_IMPLEMENTED(); } -std::size_t num_dims(MachineView const &) { - NOT_IMPLEMENTED(); +std::size_t num_dims(MachineView const &mv) { + return get_num_dims(mv.rect); } -std::size_t num_devices(MachineView const &) { - NOT_IMPLEMENTED(); +size_t num_devices(MachineView const &mv) { + return get_num_points(mv.rect).unwrapped; } -DeviceType get_device_type(MachineView const &) { - NOT_IMPLEMENTED(); +DeviceType get_device_type(MachineView const &mv) { + return get_device_type(mv.start); } static StridedRectangle make_1d_rect(int start, int stop, int stride) { @@ -40,18 +42,73 @@ MachineView make_1d_machine_view(cpu_id_t start, cpu_id_t stop, int stride) { return MachineView{device_id_t{start}, rect}; } +MachineView + make_1d_machine_view(device_id_t start, device_id_t stop, int stride) { + assert(get_device_type(start) == get_device_type(stop)); + if (get_device_type(start) == DeviceType::CPU) { + return make_1d_machine_view(unwrap_cpu(start), unwrap_cpu(stop), stride); + } + assert(get_device_type(start) == DeviceType::GPU); + return make_1d_machine_view(unwrap_gpu(start), unwrap_gpu(stop), stride); +} + +static StridedRectangle + make_1d_rect(int start, num_points_t num_points, int stride) { + return make_1d_rect(start, start + num_points.unwrapped * stride, stride); +} + +MachineView + make_1d_machine_view(cpu_id_t start, num_points_t num_points, int stride) { + StridedRectangle rect = make_1d_rect(start.cpu_index, num_points, stride); + return MachineView{device_id_t{start}, 
rect}; +} + +MachineView + make_1d_machine_view(gpu_id_t start, num_points_t num_points, int stride) { + StridedRectangle rect = make_1d_rect(start.gpu_index, num_points, stride); + return MachineView{device_id_t{start}, rect}; +} + MachineView make_1d_machine_view(device_id_t start, num_points_t num_points, int stride) { - NOT_IMPLEMENTED(); + if (get_device_type(start) == DeviceType::CPU) { + return make_1d_machine_view(unwrap_cpu(start), num_points, stride); + } else { + assert(get_device_type(start) == DeviceType::GPU); + return make_1d_machine_view(unwrap_gpu(start), num_points, stride); + } } -MachineView make_1d_machine_view(device_id_t start, +static StridedRectangle + make_1d_rect(int start, side_size_t interval_size, int stride) { + return make_1d_rect(start, start + interval_size.unwrapped, stride); +} + +MachineView make_1d_machine_view(cpu_id_t start, side_size_t interval_size, int stride) { - NOT_IMPLEMENTED(); + StridedRectangle rect = make_1d_rect(start.cpu_index, interval_size, stride); + return MachineView{device_id_t{start}, rect}; +} + +MachineView make_1d_machine_view(gpu_id_t start, + side_size_t interval_size, + int stride) { + StridedRectangle rect = make_1d_rect(start.gpu_index, interval_size, stride); + return MachineView{device_id_t{start}, rect}; } +MachineView make_1d_machine_view(device_id_t start, + side_size_t interval_size, + int stride) { + if (get_device_type(start) == DeviceType::CPU) { + return make_1d_machine_view(unwrap_cpu(start), interval_size, stride); + } else { + assert(get_device_type(start) == DeviceType::GPU); + return make_1d_machine_view(unwrap_gpu(start), interval_size, stride); + } +} MachineView make_1d_machine_view(device_id_t start, size_t interval_size) { NOT_IMPLEMENTED(); } diff --git a/lib/pcg/src/pcg/strided_rectangle_side.cc b/lib/pcg/src/pcg/strided_rectangle_side.cc index 5e7274141d..e6caf4cb86 100644 --- a/lib/pcg/src/pcg/strided_rectangle_side.cc +++ b/lib/pcg/src/pcg/strided_rectangle_side.cc @@ -3,9 +3,11 @@ namespace FlexFlow { -StridedRectangleSide strided_side_from_size_and_stride(side_size_t, +StridedRectangleSide strided_side_from_size_and_stride(side_size_t side_size, int stride) { - NOT_IMPLEMENTED(); + assert((side_size.unwrapped % stride) == 0); + return StridedRectangleSide{num_points_t{side_size.unwrapped / stride}, + stride}; } side_size_t get_side_size(StridedRectangleSide const &s) { diff --git a/lib/pcg/src/strided_rectangle.cc b/lib/pcg/src/strided_rectangle.cc index 9c8ff69b42..1c61424ab9 100644 --- a/lib/pcg/src/strided_rectangle.cc +++ b/lib/pcg/src/strided_rectangle.cc @@ -1,4 +1,5 @@ #include "pcg/strided_rectangle.h" +#include "op-attrs/dim_ordered/transform.h" #include "utils/containers.h" namespace FlexFlow { @@ -15,12 +16,20 @@ namespace FlexFlow { /* return idx; */ /* } */ -size_t get_num_dims(StridedRectangle const &) { - NOT_IMPLEMENTED(); +size_t get_num_dims(StridedRectangle const &rect) { + return rect.sides.size(); } -size_t get_side_at_idx(StridedRectangle const &) { - NOT_IMPLEMENTED(); +num_points_t get_num_points(StridedRectangle const &rect) { + return num_points_t{ + product(transform(rect.sides, [](StridedRectangleSide const &side) { + return side.num_points.unwrapped; + }))}; +} + +StridedRectangleSide get_side_at_idx(StridedRectangle const &rect, + ff_dim_t const &idx) { + return rect.sides.at(idx); } } // namespace FlexFlow diff --git a/lib/pcg/test/src/test_machine_view.cc b/lib/pcg/test/src/test_machine_view.cc new file mode 100644 index 0000000000..92a96d5e9a --- /dev/null +++ 
b/lib/pcg/test/src/test_machine_view.cc @@ -0,0 +1,74 @@ +#include "doctest/doctest.h" +#include "pcg/machine_view.h" +#include "pcg/strided_rectangle.h" +#include "pcg/strided_rectangle_side.h" + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("MachineView general util functions") { + StridedRectangle rect{{StridedRectangleSide{num_points_t{7}, 5}, + StridedRectangleSide{num_points_t{10}, 2}}}; + gpu_id_t start(1); + MachineView mv{device_id_t{start}, rect}; + SUBCASE("num_dims") { + CHECK(num_dims(mv) == 2); + } + SUBCASE("num_devices") { + CHECK(num_devices(mv) == 7 * 10); + } + SUBCASE("get_device_type") { + CHECK(get_device_type(mv) == DeviceType::GPU); + } + } + + TEST_CASE("MachineView make_1d_machine_view - GPU") { + StridedRectangle rect{{StridedRectangleSide{num_points_t{7}, 5}}}; + device_id_t start_gpu{gpu_id_t{1}}; + MachineView gpu_mv{start_gpu, rect}; + + SUBCASE("make_1d_machine_view(gpu_id_t start, gpu_id_t stop, int stride)") { + MachineView result = + make_1d_machine_view(start_gpu, device_id_t{gpu_id_t(1 + 7 * 5)}, 5); + MachineView correct = gpu_mv; + CHECK(result == correct); + } + SUBCASE("make_1d_machine_view(gpu_id_t start, num_points_t num_points, int " + "stride)") { + MachineView result = make_1d_machine_view(start_gpu, num_points_t{7}, 5); + MachineView correct = gpu_mv; + CHECK(result == correct); + } + SUBCASE("make_1d_machine_view(gpu_id_t start, side_size_t interval_size, " + "int stride)") { + MachineView result = make_1d_machine_view( + start_gpu, get_side_size(rect.sides.at(ff_dim_t{0})), 5); + MachineView correct = gpu_mv; + CHECK(result == correct); + } + } + + TEST_CASE("MachineView make_1d_machine_view - CPU") { + StridedRectangle rect{{StridedRectangleSide{num_points_t{11}, 4}}}; + device_id_t start_cpu{cpu_id_t{2}}; + MachineView cpu_mv{start_cpu, rect}; + + SUBCASE("make_1d_machine_view(cpu_id_t start, cpu_id_t stop, int stride)") { + MachineView result = + make_1d_machine_view(start_cpu, device_id_t{cpu_id_t(2 + 11 * 4)}, 4); + MachineView correct = cpu_mv; + CHECK(result == correct); + } + SUBCASE("make_1d_machine_view(cpu_id_t start, num_points_t num_points, int " + "stride)") { + MachineView result = make_1d_machine_view(start_cpu, num_points_t{11}, 4); + MachineView correct = cpu_mv; + CHECK(result == correct); + } + SUBCASE("make_1d_machine_view(cpu_id_t start, side_size_t interval_size, " + "int stride)") { + MachineView result = make_1d_machine_view( + start_cpu, get_side_size(rect.sides.at(ff_dim_t{0})), 4); + MachineView correct = cpu_mv; + CHECK(result == correct); + } + } +} diff --git a/lib/pcg/test/src/test_strided_rectangle.cc b/lib/pcg/test/src/test_strided_rectangle.cc new file mode 100644 index 0000000000..ef342944de --- /dev/null +++ b/lib/pcg/test/src/test_strided_rectangle.cc @@ -0,0 +1,37 @@ +#include "doctest/doctest.h" +#include "pcg/strided_rectangle.h" +#include "pcg/strided_rectangle_side.h" + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("get_side_size(StridedRectangleSide)") { + StridedRectangleSide side{num_points_t{7}, 5}; + + CHECK(get_side_size(side) == side_size_t{7 * 5}); + } + TEST_CASE("strided_side_from_size_and_stride") { + StridedRectangleSide correct{num_points_t{10}, 3}; + StridedRectangleSide result = + strided_side_from_size_and_stride(side_size_t{10 * 3}, 3); + CHECK(result == correct); + } + + TEST_CASE("StridedRectangle - helper functions") { + + StridedRectangleSide s0{num_points_t{7}, 5}; + StridedRectangleSide s1{num_points_t{10}, 2}; + StridedRectangleSide s2{num_points_t{8}, 1}; + StridedRectangle rect{{s0, 
s1, s2}};
+
+    SUBCASE("get_num_dims") {
+      CHECK(get_num_dims(rect) == 3);
+    }
+    SUBCASE("get_num_points") {
+      CHECK(get_num_points(rect) == num_points_t{7 * 8 * 10});
+    }
+    SUBCASE("get_side_at_idx") {
+      CHECK(get_side_at_idx(rect, ff_dim_t{0}) == s0);
+      CHECK(get_side_at_idx(rect, ff_dim_t{1}) == s1);
+      CHECK(get_side_at_idx(rect, ff_dim_t{2}) == s2);
+    }
+  }
+}
From e6e216191e16af51880ba97c3e80c770664b85cd Mon Sep 17 00:00:00 2001
From: Dylan Lim
Date: Wed, 31 Jul 2024 04:49:13 -0700
Subject: [PATCH 06/20] test utils logic cleanup, reverse cpu_kernel pedagogical implementation, other minor fixes

---
 lib/kernels/include/kernels/accessor.h        |  26 ++-
 lib/kernels/include/kernels/allocation.h      |   9 +
 .../include/kernels/cast_kernels_cpu.h        |  18 +-
 .../include/kernels/combine_kernels_cpu.h     |  10 +-
 .../include/kernels/local_cpu_allocator.h     |   1 +
 .../include/kernels/local_cuda_allocator.h    |   1 +
 .../include/kernels/replicate_kernels_cpu.h   |  12 +-
 .../include/kernels/reverse_kernels_cpu.h     |  26 ++-
 lib/kernels/src/accessor.cc                   |   6 +-
 lib/kernels/src/allocation.cc                 |  14 +-
 lib/kernels/src/array_shape.cc                |   6 +
 lib/kernels/src/cpu/cast_kernels.cc           |  35 ++--
 lib/kernels/src/cpu/combine_kernels.cc        |  18 +-
 lib/kernels/src/cpu/replicate_kernels.cc      |  30 ++--
 lib/kernels/src/cpu/reverse_kernels.cc        |  88 ++++++----
 lib/kernels/src/local_cpu_allocator.cc        |  18 +-
 lib/kernels/src/local_cuda_allocator.cc       |  11 +-
 lib/kernels/test/src/test_attention_kernel.cc |  51 +++---
 .../test/src/test_batch_matmul_kernel.cc      |  18 +-
 .../test/src/test_batch_norm_kernel.cc        |  51 +++---
 lib/kernels/test/src/test_cast_kernel.cc      |  58 ++++---
 lib/kernels/test/src/test_combine_kernel.cc   |  60 ++++---
 lib/kernels/test/src/test_concat_kernel.cc    |  19 +--
 lib/kernels/test/src/test_dropout.cc          |  15 +-
 lib/kernels/test/src/test_flat_kernel.cc      |   8 +-
 lib/kernels/test/src/test_gather_kernels.cc   |  26 +--
 .../test/src/test_layer_norm_kernels.cc       |  18 +-
 lib/kernels/test/src/test_partition_kernel.cc |   8 +-
 lib/kernels/test/src/test_pool_2d_kernels.cc  |  24 +--
 lib/kernels/test/src/test_reduction_kernel.cc |  19 +--
 lib/kernels/test/src/test_replicate_kernel.cc |  56 +++---
 lib/kernels/test/src/test_reshape_kernel.cc   |   8 +-
 lib/kernels/test/src/test_reverse_kernels.cc  |  49 +++---
 lib/kernels/test/src/test_softmax_kernel.cc   |  15 +-
 lib/kernels/test/src/test_split_kernel.cc     |  10 +-
 lib/kernels/test/src/test_transpose_kernel.cc |  20 +--
 lib/kernels/test/src/test_utils.cc            |  33 ++--
 lib/kernels/test/src/test_utils.h             | 160 +++++++++++------
 .../local-execution/tracked_allocator.h       |   1 +
 lib/local-execution/src/tracked_allocator.cc  |  10 +-
 40 files changed, 590 insertions(+), 476 deletions(-)

diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h
index 1ef121fb2a..14aa7bb010 100644
--- a/lib/kernels/include/kernels/accessor.h
+++ b/lib/kernels/include/kernels/accessor.h
@@ -29,15 +29,20 @@ class GenericTensorAccessorW {
   double *get_double_ptr() const;
   half *get_half_ptr() const;
 
+  GenericTensorAccessorW(DataType dt,
+                         ArrayShape sh,
+                         req p,
+                         bool on_dev = true)
+      : data_type(dt), shape(sh), ptr(p), on_device(on_dev) {}
+
 public:
   DataType data_type;
   ArrayShape shape;
   req ptr;
+  bool on_device;
 };
-FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW,
-                                             data_type,
-                                             shape,
-                                             ptr);
+FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(
+    GenericTensorAccessorW, data_type, shape, ptr, on_device);
 
 class GenericTensorAccessorR {
 public:
@@ -57,15 +62,20 @@ class GenericTensorAccessorR {
   double const *get_double_ptr() const;
   half 
const *get_half_ptr() const; + GenericTensorAccessorR(DataType dt, + ArrayShape sh, + req p, + bool on_dev = true) + : data_type(dt), shape(sh), ptr(p), on_device(on_dev) {} + public: DataType data_type; ArrayShape shape; req ptr; + bool on_device; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR, - data_type, - shape, - ptr); +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( + GenericTensorAccessorR, data_type, shape, ptr, on_device); int32_t *get_int32_ptr(GenericTensorAccessorW const &); int64_t *get_int64_ptr(GenericTensorAccessorW const &); diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 6500899394..452ccc47b0 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -5,10 +5,13 @@ #include #include +enum class AllocLocation { HOST, DEVICE }; + namespace FlexFlow { struct IAllocator { virtual void *allocate(size_t) = 0; + virtual void *allocate_and_zero(size_t) = 0; virtual void deallocate(void *) = 0; virtual ~IAllocator() = default; @@ -18,7 +21,11 @@ struct Allocator { Allocator() = delete; GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape); + GenericTensorAccessorW + allocate_tensor_and_zero(TensorShape const &tensor_shape); + void *allocate(size_t mem_size); + void *allocate_and_zero(size_t mem_size); void deallocate(void *ptr); template @@ -30,6 +37,8 @@ struct Allocator { Allocator(std::shared_ptr ptr) : i_allocator(ptr){}; + AllocLocation alloc_location; + private: std::shared_ptr i_allocator; }; diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h index df4ef22b93..cae0c9da8d 100644 --- a/lib/kernels/include/kernels/cast_kernels_cpu.h +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -7,19 +7,17 @@ namespace FlexFlow { namespace Kernels { namespace Cast { -namespace CPU { -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type); -void backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); +void cpu_backward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type); -} // namespace CPU } // namespace Cast } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h index 1d30297af1..66c22ddbf8 100644 --- a/lib/kernels/include/kernels/combine_kernels_cpu.h +++ b/lib/kernels/include/kernels/combine_kernels_cpu.h @@ -7,15 +7,13 @@ namespace FlexFlow { namespace Kernels { namespace Combine { -namespace CPU { -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); -void backward_kernel(GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad); +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); -} // namespace CPU } // namespace Combine } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/local_cpu_allocator.h 
b/lib/kernels/include/kernels/local_cpu_allocator.h index 27dcc9d854..121ed184e9 100644 --- a/lib/kernels/include/kernels/local_cpu_allocator.h +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -10,6 +10,7 @@ struct LocalCPUAllocator : public IAllocator { ~LocalCPUAllocator() override; void *allocate(size_t) override; + void *allocate_and_zero(size_t) override; void deallocate(void *) override; private: diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h index 18a4b6e78a..16f60daead 100644 --- a/lib/kernels/include/kernels/local_cuda_allocator.h +++ b/lib/kernels/include/kernels/local_cuda_allocator.h @@ -10,6 +10,7 @@ struct LocalCudaAllocator : public IAllocator { ~LocalCudaAllocator() override; void *allocate(size_t) override; + void *allocate_and_zero(size_t) override; void deallocate(void *) override; private: diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h index 4bc97f00ef..11d2f1bf5c 100644 --- a/lib/kernels/include/kernels/replicate_kernels_cpu.h +++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h @@ -7,16 +7,14 @@ namespace FlexFlow { namespace Kernels { namespace Replicate { -namespace CPU { -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); -void backward_kernel(GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output, - size_t num_replicas); +void cpu_backward_kernel(GenericTensorAccessorW const &input, + GenericTensorAccessorR const &output, + size_t num_replicas); -} // namespace CPU } // namespace Replicate } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h index 89ed6ffdb4..bb17aa9400 100644 --- a/lib/kernels/include/kernels/reverse_kernels_cpu.h +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -6,22 +6,20 @@ namespace FlexFlow { namespace Kernels { namespace Reverse { -namespace CPU { -void forward_kernel(float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t output_size); +void cpu_forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t output_size); -void backward_kernel(float const *out_grad_ptr, - float *in_grad_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t input_size); -} // namespace CPU +void cpu_backward_kernel(float const *out_grad_ptr, + float *in_grad_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t input_size); } // namespace Reverse } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index 56002718b1..01514ab679 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -134,8 +134,10 @@ std::vector GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &writable) { - return GenericTensorAccessorR{ - writable.data_type, writable.shape, req(writable.ptr)}; + return GenericTensorAccessorR{writable.data_type, + writable.shape, + req(writable.ptr), + writable.on_device}; } } // namespace FlexFlow diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index 
a892e14a54..c1c272fbbe 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -6,6 +6,10 @@ void *Allocator::allocate(size_t mem_size) { return this->i_allocator->allocate(mem_size); } +void *Allocator::allocate_and_zero(size_t mem_size) { + return this->i_allocator->allocate_and_zero(mem_size); +} + void Allocator::deallocate(void *ptr) { this->i_allocator->deallocate(ptr); } @@ -13,7 +17,15 @@ void Allocator::deallocate(void *ptr) { GenericTensorAccessorW Allocator::allocate_tensor(TensorShape const &tensor_shape) { void *ptr = this->allocate(get_size_in_bytes(tensor_shape)); - return {tensor_shape.data_type, tensor_shape, ptr}; + bool on_device = this->alloc_location == AllocLocation::DEVICE; + return {tensor_shape.data_type, tensor_shape, ptr, on_device}; +} + +GenericTensorAccessorW + Allocator::allocate_tensor_and_zero(TensorShape const &tensor_shape) { + void *ptr = this->allocate_and_zero(get_size_in_bytes(tensor_shape)); + bool on_device = this->alloc_location == AllocLocation::DEVICE; + return {tensor_shape.data_type, tensor_shape, ptr, on_device}; } } // namespace FlexFlow diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 5410726e0a..0aae2a8ddd 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -60,4 +60,10 @@ size_t get_volume(ArrayShape const &shape) { return shape.get_volume(); } +TensorShape get_tensor_shape(ArrayShape const &shape, DataType DT) { + FFOrdered ff_dims(shape.dims.begin(), shape.dims.end()); + TensorDims tensor_shape_dims(ff_dims); + return TensorShape(tensor_shape_dims, DT); +} + } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc index cf73a84b93..5888d9a96a 100644 --- a/lib/kernels/src/cpu/cast_kernels.cc +++ b/lib/kernels/src/cpu/cast_kernels.cc @@ -4,56 +4,55 @@ namespace FlexFlow { namespace Kernels { namespace Cast { -namespace CPU { template -void cast_forward(IDT const *input, ODT *output, size_t volume) { +void cpu_cast_forward(IDT const *input, ODT *output, size_t volume) { for (size_t i = 0; i < volume; ++i) { output[i] = static_cast(input[i]); } } template -void cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) { +void cpu_cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) { for (size_t i = 0; i < volume; i++) { output[i] = static_cast(input[i]) + beta * output[i]; } } template -struct ForwardKernel { +struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { size_t volume = input.shape.get_volume(); - cast_forward(input.get(), output.get(), volume); + cpu_cast_forward(input.get(), output.get(), volume); } }; template -struct BackwardKernel { +struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { size_t volume = input.shape.get_volume(); - cast_backward( + cpu_cast_backward( input.get(), output.get(), volume, cast_to(1.0f)); } }; -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { - DataTypeDispatch2{}(input_type, output_type, input, output); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type) { + DataTypeDispatch2{}(input_type, output_type, input, output); } -void backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const 
&output, - DataType input_type, - DataType output_type) { - DataTypeDispatch2{}(input_type, output_type, input, output); +void cpu_backward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + DataType input_type, + DataType output_type) { + DataTypeDispatch2{}( + input_type, output_type, input, output); } -} // namespace CPU } // namespace Cast } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/combine_kernels.cc b/lib/kernels/src/cpu/combine_kernels.cc index f1950a56d2..e48f4c3e01 100644 --- a/lib/kernels/src/cpu/combine_kernels.cc +++ b/lib/kernels/src/cpu/combine_kernels.cc @@ -4,10 +4,9 @@ namespace FlexFlow { namespace Kernels { namespace Combine { -namespace CPU { template -struct ForwardKernel { +struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { memcpy(output.get
(), @@ -17,7 +16,7 @@ struct ForwardKernel { }; template -struct BackwardKernel { +struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { size_t num_elements = output_grad.shape.get_volume(); @@ -27,18 +26,17 @@ struct BackwardKernel { } }; -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(input.data_type, input, output); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1{}(input.data_type, input, output); } -void backward_kernel(GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad) { - DataTypeDispatch1{}( +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + DataTypeDispatch1{}( input_grad.data_type, output_grad, input_grad); } -} // namespace CPU } // namespace Combine } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index a26d2054d1..239baf4041 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -4,13 +4,12 @@ namespace FlexFlow { namespace Kernels { namespace Replicate { -namespace CPU { template -void replicate_backward_kernel(T *input, - T const *output, - size_t num_elements, - size_t num_replicas) { +void cpu_replicate_backward_kernel(T *input, + T const *output, + size_t num_elements, + size_t num_replicas) { for (size_t i = 0; i < num_elements; ++i) { T sum = 0; for (size_t j = 0; j < num_replicas; ++j) { @@ -23,7 +22,7 @@ void replicate_backward_kernel(T *input, // Why does replicate forward seem to only transfer memory? Shouldn't it also // handle the replication? 
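
The question in the comment above comes up because the forward kernel below is only a memcpy. One reading of this design (an assumption on my part, not something the patch states) is that fanning the data out to the individual replicas is handled by the surrounding runtime and tensor layout, so the forward kernel only has to materialize a single copy, while the backward kernel does the real work of summing every replica's gradient back into the input gradient. A minimal stand-alone sketch of those semantics, using plain std::vector buffers instead of the accessor types used here and assuming a replica-major gradient layout:

#include <cassert>
#include <cstddef>
#include <vector>

// Forward: the replicated output is logically just a copy of the input.
std::vector<float> replicate_forward(std::vector<float> const &input) {
  return input;
}

// Backward: each input-gradient element accumulates the matching element from
// every replica of the output gradient (replica-major layout assumed).
std::vector<float> replicate_backward(std::vector<float> const &output_grad,
                                      std::size_t num_elements,
                                      std::size_t num_replicas) {
  assert(output_grad.size() == num_elements * num_replicas);
  std::vector<float> input_grad(num_elements, 0.0f);
  for (std::size_t i = 0; i < num_elements; ++i) {
    for (std::size_t j = 0; j < num_replicas; ++j) {
      input_grad[i] += output_grad[j * num_elements + i];
    }
  }
  return input_grad;
}

With this layout, if every replica carries the same gradient value, the reduced input gradient is simply that value multiplied by num_replicas, which is the expected gradient of a fan-out.
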
template -struct ForwardKernel { +struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { memcpy(output.get(), @@ -33,29 +32,28 @@ struct ForwardKernel { }; template -struct BackwardKernel { +struct CPUBackwardKernel { void operator()(GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, size_t num_replicas) { size_t total_elements = input.shape.num_elements() * num_replicas; - replicate_backward_kernel( + cpu_replicate_backward_kernel( input.get(), output.get(), total_elements, num_replicas); } }; -void forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(input.data_type, input, output); +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1{}(input.data_type, input, output); } -void backward_kernel(GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output, - size_t num_replicas) { - DataTypeDispatch1{}( +void cpu_backward_kernel(GenericTensorAccessorW const &input, + GenericTensorAccessorR const &output, + size_t num_replicas) { + DataTypeDispatch1{}( input.data_type, input, output, num_replicas); } -} // namespace CPU } // namespace Replicate } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index b035f03721..350dad03e9 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -1,48 +1,78 @@ #include "kernels/reverse_kernels_cpu.h" +#include +#include namespace FlexFlow { namespace Kernels { namespace Reverse { -namespace CPU { -void reverse_forward_kernel(float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size) { +void cpu_reverse_forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size) { coord_t total_elements = num_out_blks * reverse_dim_size * in_blk_size; - for (coord_t i = 0; i < total_elements; ++i) { - coord_t blk_idx = i / (reverse_dim_size * in_blk_size); - coord_t offset = i - blk_idx * (reverse_dim_size * in_blk_size); - coord_t reverse_dim_idx = offset / in_blk_size; - coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + - (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + - (offset % in_blk_size); - out_ptr[i] = in_ptr[in_idx]; + + std::vector> in_blocks(num_out_blks * reverse_dim_size, + std::vector(in_blk_size)); + + // For each output block, copy the input block into in_blocks + for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) { + // Each output block has reverse_dim_size input blocks + for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; ++rev_idx) { + coord_t start_idx = (blk_idx * reverse_dim_size + rev_idx) * in_blk_size; + + // Copy elements from in_ptr to the current block in in_blocks + std::vector ¤t_block = + in_blocks[blk_idx * reverse_dim_size + rev_idx]; + for (coord_t i = 0; i < in_blk_size; ++i) { + current_block[i] = in_ptr[start_idx + i]; + } + } + } + + // Reverse the in_blocks within each output block + for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) { + auto block_start = in_blocks.begin() + blk_idx * reverse_dim_size; + auto block_end = block_start + reverse_dim_size; + std::reverse(block_start, block_end); + } + + // Copy the reversed blocks to the output array + for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) 
{ + for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; ++rev_idx) { + coord_t start_idx = (blk_idx * reverse_dim_size + rev_idx) * in_blk_size; + + // Copy elements from the current block in in_blocks to out_ptr + std::vector const ¤t_block = + in_blocks[blk_idx * reverse_dim_size + rev_idx]; + for (coord_t i = 0; i < in_blk_size; ++i) { + out_ptr[start_idx + i] = current_block[i]; + } + } } } -void forward_kernel(float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t output_size) { - reverse_forward_kernel( +void cpu_forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t output_size) { + cpu_reverse_forward_kernel( in_ptr, out_ptr, num_out_blks, reverse_dim_size, in_blk_size); } -void backward_kernel(float const *out_grad_ptr, - float *in_grad_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t input_size) { - reverse_forward_kernel( +void cpu_backward_kernel(float const *out_grad_ptr, + float *in_grad_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t input_size) { + cpu_reverse_forward_kernel( out_grad_ptr, in_grad_ptr, num_out_blks, reverse_dim_size, in_blk_size); } -} // namespace CPU } // namespace Reverse } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/local_cpu_allocator.cc b/lib/kernels/src/local_cpu_allocator.cc index 9cc86c44ca..ced707edcc 100644 --- a/lib/kernels/src/local_cpu_allocator.cc +++ b/lib/kernels/src/local_cpu_allocator.cc @@ -3,6 +3,18 @@ namespace FlexFlow { void *LocalCPUAllocator::allocate(size_t requested_memory_size) { + void *ptr = malloc(requested_memory_size); + + if (ptr != nullptr) { + this->ptrs.insert(ptr); + } else { + throw std::bad_alloc(); + } + + return ptr; +} + +void *LocalCPUAllocator::allocate_and_zero(size_t requested_memory_size) { void *ptr = calloc(1, requested_memory_size); if (ptr != nullptr) { @@ -25,13 +37,15 @@ void LocalCPUAllocator::deallocate(void *ptr) { } LocalCPUAllocator::~LocalCPUAllocator() { - for (auto ptr : ptrs) { + for (void *ptr : this->ptrs) { free(ptr); } } Allocator create_local_cpu_memory_allocator() { - return Allocator::create(); + Allocator allocator = Allocator::create(); + allocator.alloc_location = AllocLocation::HOST; + return allocator; } } // namespace FlexFlow diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/local_cuda_allocator.cc index c82abc765d..c93e32734c 100644 --- a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/local_cuda_allocator.cc @@ -3,6 +3,13 @@ namespace FlexFlow { void *LocalCudaAllocator::allocate(size_t requested_memory_size) { + void *ptr; + checkCUDA(cudaMalloc(&ptr, requested_memory_size)); + this->ptrs.insert(ptr); + return ptr; +} + +void *LocalCudaAllocator::allocate_and_zero(size_t requested_memory_size) { void *ptr; checkCUDA(cudaMalloc(&ptr, requested_memory_size)); checkCUDA(cudaMemset(ptr, 0, requested_memory_size)); @@ -27,7 +34,9 @@ LocalCudaAllocator::~LocalCudaAllocator() { } Allocator create_local_cuda_memory_allocator() { - return Allocator::create(); + Allocator allocator = Allocator::create(); + allocator.alloc_location = AllocLocation::DEVICE; + return allocator; } } // namespace FlexFlow diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index c37b83fa24..bbb3c62a85 100644 --- 
a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -33,28 +33,28 @@ TEST_SUITE(FF_TEST_SUITE) { kvSeqLength, false); - TensorShape query_shape = - make_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, qSize}); - TensorShape key_shape = make_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, kSize}); - TensorShape value_shape = - make_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, vSize}); - TensorShape output_shape = - make_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, oProjSize}); + TensorShape query_shape = make_tensor_shape_from_legion_dims( + {qoSeqLength, num_samples, qSize}, DataType::FLOAT); + TensorShape key_shape = make_tensor_shape_from_legion_dims( + {kvSeqLength, num_samples, kSize}, DataType::FLOAT); + TensorShape value_shape = make_tensor_shape_from_legion_dims( + {kvSeqLength, num_samples, vSize}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {qoSeqLength, num_samples, oProjSize}, DataType::FLOAT); TensorShape weight_shape = - make_tensor_shape_from_legion_dims({state.weightSize}); + make_tensor_shape_from_legion_dims({state.weightSize}, DataType::FLOAT); GenericTensorAccessorW query_accessor = - create_random_filled_accessor_w(query_shape, allocator); + create_random_filled_accessor_w(query_shape, + allocator); GenericTensorAccessorW key_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w(key_shape, allocator); GenericTensorAccessorW value_accessor = - create_random_filled_accessor_w(value_shape, allocator); + create_random_filled_accessor_w(value_shape, + allocator); GenericTensorAccessorW weight_accessor = - create_random_filled_accessor_w(weight_shape, allocator); + create_random_filled_accessor_w(weight_shape, + allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -69,22 +69,27 @@ TEST_SUITE(FF_TEST_SUITE) { weight_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output = + load_accessor_data(output_accessor); CHECK(contains_non_zero(host_output)); } SUBCASE("backward_kernel") { GenericTensorAccessorW query_grad_accessor = - create_random_filled_accessor_w(query_shape, allocator); + create_random_filled_accessor_w(query_shape, + allocator); GenericTensorAccessorW key_grad_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w(key_shape, + allocator); GenericTensorAccessorW value_grad_accessor = - create_random_filled_accessor_w(value_shape, allocator); + create_random_filled_accessor_w(value_shape, + allocator); GenericTensorAccessorW weight_grad_accessor = - create_random_filled_accessor_w(weight_shape, allocator); + create_random_filled_accessor_w(weight_shape, + allocator); GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); Kernels::MultiHeadAttention::backward_kernel( managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index 51a50e6cf2..e64941b574 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -20,18 +20,21 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = 
create_local_cuda_memory_allocator(); TensorShape input_shape_a = - make_tensor_shape_from_legion_dims({m, k, batch}); + make_tensor_shape_from_legion_dims({m, k, batch}, DataType::FLOAT); TensorShape input_shape_b = - make_tensor_shape_from_legion_dims({k, n, batch}); + make_tensor_shape_from_legion_dims({k, n, batch}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({m, n, batch}); + make_tensor_shape_from_legion_dims({m, n, batch}, DataType::FLOAT); GenericTensorAccessorW a_accessor = - create_random_filled_accessor_w(input_shape_a, allocator); + create_random_filled_accessor_w(input_shape_a, + allocator); GenericTensorAccessorW b_accessor = - create_random_filled_accessor_w(input_shape_b, allocator); + create_random_filled_accessor_w(input_shape_b, + allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); SUBCASE("forward_kernel") { Kernels::BatchMatmul::forward_kernel(managed_stream.raw_stream(), @@ -50,7 +53,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW o_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW a_grad_accessor = allocator.allocate_tensor(input_shape_a); GenericTensorAccessorW b_grad_accessor = diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 0d4682996a..5135d703fd 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -23,23 +23,21 @@ TEST_SUITE(FF_TEST_SUITE) { output_w, true); - TensorShape input_shape = - make_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape output_shape = - make_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape scale_shape = - make_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape bias_shape = - make_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape scale_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); + TensorShape bias_shape = make_tensor_shape_from_legion_dims( + {output_n, output_c, output_h, output_w}, DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW scale_accessor = create_filled_accessor_w(scale_shape, allocator, 1.0f); @@ -54,20 +52,24 @@ TEST_SUITE(FF_TEST_SUITE) { scale_accessor.get_float_ptr(), bias_accessor.get_float_ptr()); - std::vector host_output_data = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = + load_accessor_data(output_accessor); CHECK(contains_non_zero(host_output_data)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - 
create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW scale_grad_accessor = - create_random_filled_accessor_w(scale_shape, allocator); + create_random_filled_accessor_w(scale_shape, + allocator); GenericTensorAccessorW bias_grad_accessor = - create_random_filled_accessor_w(bias_shape, allocator); + create_random_filled_accessor_w(bias_shape, + allocator); Kernels::BatchNorm::backward_kernel(managed_stream.raw_stream(), state, @@ -81,14 +83,11 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.shape.num_elements()); std::vector host_input_grad_data = - load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor)); + load_accessor_data(input_grad_accessor); std::vector host_scale_grad_data = - load_accessor_data( - read_only_accessor_from_write_accessor(scale_grad_accessor)); + load_accessor_data(scale_grad_accessor); std::vector host_bias_grad_data = - load_accessor_data( - read_only_accessor_from_write_accessor(bias_grad_accessor)); + load_accessor_data(bias_grad_accessor); CHECK(contains_non_zero(host_input_grad_data)); CHECK(contains_non_zero(host_scale_grad_data)); diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index e7da356564..4e54aa2e1c 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -12,17 +12,16 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({100, 100}); - - GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + make_tensor_shape_from_legion_dims({100, 100}, DataType::DOUBLE); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, + allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); Kernels::Cast::forward_kernel(managed_stream.raw_stream(), input_accessor, @@ -31,26 +30,26 @@ TEST_SUITE(FF_TEST_SUITE) { DataType::DOUBLE); std::vector host_double_data = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + load_accessor_data(output_accessor); CHECK(contains_non_zero(host_double_data)); } SUBCASE("backward_kernel") { + GenericTensorAccessorR grad_output_accessor = + create_random_filled_accessor_r(output_shape, + allocator); GenericTensorAccessorW grad_input_accessor = allocator.allocate_tensor(input_shape); - Kernels::Cast::backward_kernel( - managed_stream.raw_stream(), - read_only_accessor_from_write_accessor(output_accessor), - grad_input_accessor, - DataType::DOUBLE, - DataType::FLOAT); + Kernels::Cast::backward_kernel(managed_stream.raw_stream(), + grad_output_accessor, + grad_input_accessor, + DataType::DOUBLE, + DataType::FLOAT); std::vector host_grad_float_data = - load_accessor_data( - read_only_accessor_from_write_accessor(grad_input_accessor)); + load_accessor_data(grad_input_accessor); CHECK(contains_non_zero(host_grad_float_data)); } } @@ -62,9 +61,9 @@ 
TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::INT32); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(output_shape); @@ -81,31 +80,34 @@ TEST_SUITE(FF_TEST_SUITE) { // Run GPU Forward Kernel GenericTensorAccessorW input_accessor_gpu = create_transformed_accessor_w( - input_shape, gpu_allocator, transform, false); + input_shape, gpu_allocator, transform); Kernels::Cast::forward_kernel( managed_stream.raw_stream(), read_only_accessor_from_write_accessor(input_accessor_gpu), output_accessor_gpu, DataType::FLOAT, DataType::INT32); + std::cout << "Before GPU load" << std::endl; std::vector result_data_gpu = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), - false); + load_accessor_data(output_accessor_gpu); // Run CPU Forward Kernel GenericTensorAccessorW input_accessor_cpu = create_transformed_accessor_w( - input_shape, cpu_allocator, transform, true); - Kernels::Cast::CPU::forward_kernel( + input_shape, cpu_allocator, transform); + Kernels::Cast::cpu_forward_kernel( read_only_accessor_from_write_accessor(input_accessor_cpu), output_accessor_cpu, DataType::FLOAT, DataType::INT32); + std::cout << "Before CPU load" << std::endl; + if (output_accessor_cpu.on_device) { + std::cout << "CPU data is on device" << std::endl; + } else { + std::cout << "CPU data is on host" << std::endl; + } std::vector result_data_cpu = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), - true); + load_accessor_data(output_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 60c55ca062..aeceb1ef4d 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -12,28 +12,28 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100, 100}); + make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, + allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Combine::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector host_output_data = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = + load_accessor_data(output_accessor); CHECK(contains_non_zero(host_output_data)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, + allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -41,8 +41,8 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor, input_grad_accessor); - std::vector host_input_grad = load_accessor_data( - 
read_only_accessor_from_write_accessor(input_grad_accessor)); + std::vector host_input_grad = + load_accessor_data(input_grad_accessor); CHECK(contains_non_zero(host_input_grad)); } } @@ -54,36 +54,36 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({5, 5}); + make_tensor_shape_from_legion_dims({5, 5}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { // Run GPU Combine Forward Kernel GenericTensorAccessorR input_accessor_gpu = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, gpu_allocator)); + create_random_filled_accessor_r(input_shape, + gpu_allocator); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(output_shape); Kernels::Combine::forward_kernel( managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), false); + std::vector result_data_gpu = + load_accessor_data(output_accessor_gpu); // Run CPU Combine Forward Kernel GenericTensorAccessorW input_accessor_cpu = - copy_tensor_between_memories( - input_accessor_gpu, input_shape, cpu_allocator); + copy_tensor_between_memories(input_accessor_gpu, + cpu_allocator); GenericTensorAccessorW output_accessor_cpu = cpu_allocator.allocate_tensor(output_shape); - Kernels::Combine::CPU::forward_kernel( + Kernels::Combine::cpu_forward_kernel( read_only_accessor_from_write_accessor(input_accessor_cpu), output_accessor_cpu); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), true); + std::vector result_data_cpu = + load_accessor_data(output_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } @@ -91,33 +91,31 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { // Run GPU Combine Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, gpu_allocator)); + create_random_filled_accessor_r(output_shape, + gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor(input_shape); + gpu_allocator.allocate_tensor_and_zero(input_shape); Kernels::Combine::backward_kernel(managed_stream.raw_stream(), output_grad_accessor_gpu, input_grad_accessor_gpu); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - false); + std::vector result_data_gpu = + load_accessor_data(input_grad_accessor_gpu); // Run CPU Combine Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = copy_tensor_between_memories( - output_grad_accessor_gpu, output_shape, cpu_allocator); + output_grad_accessor_gpu, cpu_allocator); GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor(input_shape); + cpu_allocator.allocate_tensor_and_zero(input_shape); - Kernels::Combine::CPU::backward_kernel( + Kernels::Combine::cpu_backward_kernel( read_only_accessor_from_write_accessor(output_grad_accessor_cpu), input_grad_accessor_cpu); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - true); + std::vector result_data_cpu = + load_accessor_data(input_grad_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_concat_kernel.cc 
b/lib/kernels/test/src/test_concat_kernel.cc index 04bd4b5929..8754381850 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -13,18 +13,17 @@ TEST_SUITE(FF_TEST_SUITE) { ManagedFFStream managed_stream{}; TensorShape input_shape = - make_tensor_shape_from_legion_dims({size_per_input}); - TensorShape output_shape = - make_tensor_shape_from_legion_dims( - {size_per_input, num_inputs}); + make_tensor_shape_from_legion_dims({size_per_input}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {size_per_input, num_inputs}, DataType::FLOAT); Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { std::vector input_accessors = repeat(num_inputs, [&]() { - return read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + return create_random_filled_accessor_r(input_shape, + allocator); }); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -34,16 +33,16 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessors, concat_axis); - std::vector host_output_data = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = + load_accessor_data(output_accessor); CHECK(contains_non_zero(host_output_data)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, + allocator); std::vector input_grad_accessors = repeat( num_inputs, [&]() { return allocator.allocate_tensor(input_shape); }); diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index c944a80b02..8237e61729 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -14,7 +14,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, 10}); + make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; @@ -31,8 +31,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, + allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -42,17 +42,18 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor.get_float_ptr()); std::vector host_output_accessor = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + load_accessor_data(output_accessor); CHECK(contains_non_zero(host_output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_data = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW input_grad_data = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Dropout::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 3f8ef38f0b..5c88110fde 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -11,7 +11,7 @@ TEST_SUITE(FF_TEST_SUITE) { ManagedFFStream managed_stream{}; 
TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}); + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = @@ -27,8 +27,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor.get_float_ptr()); std::vector check_output_data = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + load_accessor_data(output_accessor); std::vector expected_output_data( input_accessor.shape.num_elements(), 2.0f); @@ -47,8 +46,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor.get_float_ptr()); std::vector backward_output_data = - load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor)); + load_accessor_data(input_grad_accessor); std::vector expected_output_data( input_accessor.shape.num_elements(), 1.0f); diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index cfabef7ab2..b8c4da0df2 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -13,18 +13,18 @@ TEST_SUITE(FF_TEST_SUITE) { GatherPerDeviceState state = {managed_handle.raw_handle(), legion_dim_t(2)}; TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}); + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({50}); + make_tensor_shape_from_legion_dims({50}, DataType::FLOAT); GenericTensorAccessorR index_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, + allocator); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, + allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -34,17 +34,18 @@ TEST_SUITE(FF_TEST_SUITE) { index_accessor, output_accessor); - std::vector host_output_data = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = + load_accessor_data(output_accessor); CHECK(contains_non_zero(host_output_data)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, + allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Gather::backward_kernel(managed_stream.raw_stream(), state, @@ -53,8 +54,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor); std::vector host_input_grad_data = - load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor)); + load_accessor_data(input_grad_accessor); CHECK(contains_non_zero(host_input_grad_data)); } } diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 5bb589607b..651959d171 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -11,12 +11,11 @@ TEST_SUITE(FF_TEST_SUITE) { float epsilon = 1e-5f; bool elementwise_affine = true; - TensorShape input_shape = - make_tensor_shape_from_legion_dims( - {batch_size, feature_size}); + TensorShape input_shape = 
make_tensor_shape_from_legion_dims( + {batch_size, feature_size}, DataType::FLOAT); TensorShape output_shape = input_shape; TensorShape feature_shape = - make_tensor_shape_from_legion_dims({feature_size}); + make_tensor_shape_from_legion_dims({feature_size}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -32,8 +31,8 @@ TEST_SUITE(FF_TEST_SUITE) { epsilon); GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, + allocator); GenericTensorAccessorW gamma_accessor = create_filled_accessor_w(feature_shape, allocator, 1.0f); @@ -53,10 +52,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, + allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW gamma_grad_accessor = allocator.allocate_tensor(feature_shape); GenericTensorAccessorW beta_grad_accessor = diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 1e009b205a..d34101d349 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_handle.raw_handle(), DataType::FLOAT); TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, 10}); + make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { @@ -29,8 +29,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), state, input_accessor, output_accessor); std::vector check_output_data = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + load_accessor_data(output_accessor); std::vector expected_output_data( input_accessor.shape.num_elements(), 1.0f); @@ -50,8 +49,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor); std::vector host_grad_input_data = - load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor)); + load_accessor_data(input_grad_accessor); std::vector expected_grad_input_data( input_grad_accessor.shape.num_elements(), 3.0f); diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index d6df1daa4a..e014accfd3 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -36,17 +36,17 @@ TEST_SUITE(FF_TEST_SUITE) { stride_w, pool_type); - TensorShape input_shape = - make_tensor_shape_from_legion_dims( - {input_w, input_h, input_c, input_n}); - TensorShape output_shape = - make_tensor_shape_from_legion_dims( - {output_w, output_h, output_c, output_n}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {input_w, input_h, input_c, input_n}, DataType::FLOAT); + TensorShape output_shape = make_tensor_shape_from_legion_dims( + {output_w, output_h, output_c, output_n}, DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + 
create_random_filled_accessor_w(output_shape, + allocator); SUBCASE("forward_kernel") { Kernels::Pool2D::forward_kernel(managed_stream.raw_stream(), @@ -54,8 +54,8 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.ptr, output_accessor.ptr); - std::vector host_output_data = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = + load_accessor_data(output_accessor); CHECK(contains_non_zero(host_output_data)); } @@ -72,8 +72,8 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor.ptr, output_grad_accessor.ptr); - std::vector host_input_grad = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor)); + std::vector host_input_grad = + load_accessor_data(input_grad_accessor); CHECK(contains_non_zero(host_input_grad)); } } diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 5dcf85e39d..989ffde163 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -7,9 +7,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Reduction Forward and Backward Kernel") { std::size_t num_replicas = 5; - TensorShape input_shape = - make_tensor_shape_from_legion_dims( - {10, 10, 10, 10, 10}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {10, 10, 10, 10, 10}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -18,11 +17,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { TensorShape output_shape = - make_tensor_shape_from_legion_dims({10}); + make_tensor_shape_from_legion_dims({10}, DataType::FLOAT); GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, + allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -31,8 +30,8 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor, num_replicas); - std::vector host_output_data = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = + load_accessor_data(output_accessor); CHECK(contains_non_zero(host_output_data)); } @@ -51,8 +50,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector expected_grad_input_data( input_grad_accessor.shape.num_elements(), 1.0f); - std::vector host_grad_data = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor)); + std::vector host_grad_data = + load_accessor_data(input_grad_accessor); CHECK(host_grad_data == expected_grad_input_data); } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 49807355e1..315a1c3489 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -9,9 +9,9 @@ TEST_SUITE(FF_TEST_SUITE) { std::size_t num_replicas = 10; TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}); + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({100}); + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -29,8 +29,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), input_accessor, output_accessor); std::vector check_output_data = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + 
load_accessor_data(output_accessor); std::vector expected_output_data( input_accessor.shape.num_elements(), 1.0f); @@ -50,8 +49,7 @@ TEST_SUITE(FF_TEST_SUITE) { num_replicas); std::vector check_aggregated_data = - load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor)); + load_accessor_data(input_grad_accessor); CHECK(contains_non_zero(check_aggregated_data)); } } @@ -63,11 +61,11 @@ TEST_SUITE(FF_TEST_SUITE) { // reduced shape, but things are weird cause doesn't seem to be replicating // anything (ie. input shape should be same as reduced shape) TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, num_replicas}); + make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT); TensorShape replicated_shape = - make_tensor_shape_from_legion_dims({10, num_replicas}); + make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT); TensorShape reduced_shape = - make_tensor_shape_from_legion_dims({10}); + make_tensor_shape_from_legion_dims({10}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -78,30 +76,30 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { // Run GPU Replicate Forward Kernel GenericTensorAccessorR input_accessor_gpu = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, gpu_allocator)); + create_random_filled_accessor_r(input_shape, + gpu_allocator); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(replicated_shape); Kernels::Replicate::forward_kernel( managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), false); + std::vector result_data_gpu = + load_accessor_data(output_accessor_gpu); // Run CPU Replicate Forward Kernel GenericTensorAccessorW input_accessor_cpu = - copy_tensor_between_memories( - input_accessor_gpu, input_shape, cpu_allocator); + copy_tensor_between_memories(input_accessor_gpu, + cpu_allocator); GenericTensorAccessorW output_accessor_cpu = cpu_allocator.allocate_tensor(replicated_shape); - Kernels::Replicate::CPU::forward_kernel( + Kernels::Replicate::cpu_forward_kernel( read_only_accessor_from_write_accessor(input_accessor_cpu), output_accessor_cpu); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), true); + std::vector result_data_cpu = + load_accessor_data(output_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } @@ -109,35 +107,33 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { // Run GPU Replicate Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(replicated_shape, gpu_allocator)); + create_random_filled_accessor_r(replicated_shape, + gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor(reduced_shape); + gpu_allocator.allocate_tensor_and_zero(reduced_shape); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), input_grad_accessor_gpu, output_grad_accessor_gpu, num_replicas); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - false); + std::vector result_data_gpu = + load_accessor_data(input_grad_accessor_gpu); // Run CPU Replicate Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = copy_tensor_between_memories( - 
output_grad_accessor_gpu, replicated_shape, cpu_allocator); + output_grad_accessor_gpu, cpu_allocator); GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor(reduced_shape); + cpu_allocator.allocate_tensor_and_zero(reduced_shape); - Kernels::Replicate::CPU::backward_kernel( + Kernels::Replicate::cpu_backward_kernel( input_grad_accessor_cpu, read_only_accessor_from_write_accessor(output_grad_accessor_cpu), num_replicas); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - true); + std::vector result_data_cpu = + load_accessor_data(input_grad_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index e1a8ccc4b7..e8b3d9d2f5 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -11,7 +11,7 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}); + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = input_shape; ReshapePerDeviceState state = @@ -28,8 +28,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), state, input_accessor, output_accessor); std::vector check_output_data = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + load_accessor_data(output_accessor); std::vector expected_output_data( input_accessor.shape.num_elements(), 1.0f); @@ -49,8 +48,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor); std::vector host_grad_input_data = - load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor)); + load_accessor_data(input_grad_accessor); std::vector expected_grad_input_data( input_grad_accessor.shape.num_elements(), 3.0f); diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index fc7acc99cd..be1d946902 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -10,9 +10,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::size_t in_blk_size = 10; std::size_t num_out_blks = 1; - TensorShape input_shape = - make_tensor_shape_from_legion_dims( - {num_out_blks, reverse_dim_size, in_blk_size}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; ManagedPerDeviceFFHandle managed_handle{}; @@ -36,15 +35,15 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.shape.num_elements()); std::vector check_output_data = - load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + load_accessor_data(output_accessor); CHECK(contains_non_zero(check_output_data)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -58,8 +57,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor.shape.num_elements()); std::vector host_grad_input_data = - load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor)); + load_accessor_data(input_grad_accessor); CHECK(contains_non_zero(host_grad_input_data)); } @@ -70,9 +68,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::size_t reverse_dim_size = 3; 
std::size_t in_blk_size = 5; - TensorShape input_shape = - make_tensor_shape_from_legion_dims( - {num_out_blks, reverse_dim_size, in_blk_size}); + TensorShape input_shape = make_tensor_shape_from_legion_dims( + {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; ManagedPerDeviceFFHandle managed_handle{}; @@ -89,7 +86,7 @@ TEST_SUITE(FF_TEST_SUITE) { // Run GPU Cast Forward Kernel GenericTensorAccessorW input_accessor_gpu = create_transformed_accessor_w( - input_shape, gpu_allocator, transform, false); + input_shape, gpu_allocator, transform); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(output_shape); @@ -101,17 +98,17 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_accessor_gpu.shape.num_elements()); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_gpu), false); + std::vector result_data_gpu = + load_accessor_data(output_accessor_gpu); // Run CPU Cast Forward Kernel GenericTensorAccessorW input_accessor_cpu = create_transformed_accessor_w( - input_shape, cpu_allocator, transform, true); + input_shape, cpu_allocator, transform); GenericTensorAccessorW output_accessor_cpu = cpu_allocator.allocate_tensor(output_shape); - Kernels::Reverse::CPU::forward_kernel( + Kernels::Reverse::cpu_forward_kernel( input_accessor_cpu.get_float_ptr(), output_accessor_cpu.get_float_ptr(), num_out_blks, @@ -119,8 +116,8 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_accessor_cpu.shape.num_elements()); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor_cpu), true); + std::vector result_data_cpu = + load_accessor_data(output_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } @@ -128,7 +125,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { // Run GPU Cast Backward Kernel GenericTensorAccessorW output_grad_accessor_gpu = - create_random_filled_accessor_w(output_shape, gpu_allocator); + create_random_filled_accessor_w(output_shape, + gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = gpu_allocator.allocate_tensor(input_shape); @@ -141,20 +139,18 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_grad_accessor_gpu.shape.num_elements()); - std::vector result_data_gpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_gpu), - false); + std::vector result_data_gpu = + load_accessor_data(input_grad_accessor_gpu); // Run CPU Cast Backward Kernel GenericTensorAccessorW output_grad_accessor_cpu = copy_tensor_between_memories( read_only_accessor_from_write_accessor(output_grad_accessor_gpu), - output_shape, cpu_allocator); GenericTensorAccessorW input_grad_accessor_cpu = cpu_allocator.allocate_tensor(input_shape); - Kernels::Reverse::CPU::backward_kernel( + Kernels::Reverse::cpu_backward_kernel( output_grad_accessor_cpu.get_float_ptr(), input_grad_accessor_cpu.get_float_ptr(), num_out_blks, @@ -162,9 +158,8 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_grad_accessor_cpu.shape.num_elements()); - std::vector result_data_cpu = load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor_cpu), - true); + std::vector result_data_cpu = + load_accessor_data(input_grad_accessor_cpu); CHECK(result_data_gpu == result_data_cpu); } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index a9f7fa8bc0..c25c2f91d3 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ 
b/lib/kernels/test/src/test_softmax_kernel.cc @@ -14,26 +14,28 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}); + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = input_shape; SoftmaxPerDeviceState state = Kernels::Softmax::init_kernel( managed_handle.raw_handle(), 0, input_n, channels, input_h, input_w); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Softmax::forward_kernel(managed_stream.raw_stream(), state, input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output_data = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = + load_accessor_data(output_accessor); CHECK(contains_non_zero(host_output_data)); } @@ -52,8 +54,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector expected_input_grad_data = std::vector(input_grad_accessor.shape.num_elements(), 1.0f); std::vector host_input_grad_data = - load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor)); + load_accessor_data(input_grad_accessor); CHECK(host_input_grad_data == expected_input_grad_data); } } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index 304a7ba121..26acbee33c 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -17,13 +17,14 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100}); + make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({50}); + make_tensor_shape_from_legion_dims({50}, DataType::FLOAT); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); std::vector output_ptrs = repeat(num_outputs, [&]() { GenericTensorAccessorW output_accessor = @@ -44,7 +45,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector output_grad_ptrs(num_outputs); for (int i = 0; i < num_outputs; i++) { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, allocator); + create_random_filled_accessor_w(output_shape, + allocator); output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index a4cbf37c4b..2abbd66c8f 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -18,30 +18,31 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Transpose::init_kernel(num_dims, perm); TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, 10}); + make_tensor_shape_from_legion_dims({10, 10}, DataType::FLOAT); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, + allocator); 
GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Transpose::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector host_output_data = load_accessor_data( - read_only_accessor_from_write_accessor(output_accessor)); + std::vector host_output_data = + load_accessor_data(output_accessor); CHECK(contains_non_zero(host_output_data)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, + allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); + create_random_filled_accessor_w(input_shape, + allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), state, @@ -49,8 +50,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor); std::vector host_grad_input_data = - load_accessor_data( - read_only_accessor_from_write_accessor(input_grad_accessor)); + load_accessor_data(input_grad_accessor); CHECK(contains_non_zero(host_grad_input_data)); } } diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index c9d2bf0a7c..b147523604 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -1,24 +1,19 @@ #include "test_utils.h" -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool on_host) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements(); - std::vector host_data(volume); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dist(-1.0f, 1.0f); +GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW accessor = + create_random_filled_accessor_w(shape, allocator); - for (auto &val : host_data) { - val = dist(gen); - } - - transfer_memory(static_cast(accessor.ptr), - host_data.data(), - volume, - GpuDirection::HostToDevice, - on_host); + return read_only_accessor_from_write_accessor(accessor); +} - return accessor; +TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims, + DataType DT) { + return TensorShape{ + TensorDims{ + dims, + }, + DT, + }; } diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index 1ce9e7a3d7..4426ba2df8 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -8,55 +8,74 @@ #include "kernels/managed_per_device_ff_handle.h" #include -enum class GpuDirection { - HostToDevice = 0, - DeviceToHost = 1, - DeviceToDevice = 2 -}; - template -void transfer_memory(DT *dst, +void transfer_memory(GenericTensorAccessorW dst_accessor, const DT *src, - size_t num_elements, - GpuDirection gpu_dir, - bool cpu_memory) { - size_t bytes = num_elements * sizeof(DT); - - if (cpu_memory) { - memcpy(dst, src, bytes); + AllocLocation src_loc) { + size_t bytes = dst_accessor.shape.get_volume() * sizeof(DT); + AllocLocation dst_loc = + dst_accessor.on_device ? 
AllocLocation::DEVICE : AllocLocation::HOST; + + if (src_loc == AllocLocation::HOST && dst_loc == AllocLocation::HOST) { + memcpy(dst_accessor.ptr, src, bytes); + } else if (src_loc == AllocLocation::HOST && + dst_loc == AllocLocation::DEVICE) { + checkCUDA(cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyHostToDevice)); + } else if (src_loc == AllocLocation::DEVICE && + dst_loc == AllocLocation::HOST) { + checkCUDA(cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyDeviceToHost)); } else { - switch (gpu_dir) { - case GpuDirection::HostToDevice: - checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice)); - break; - case GpuDirection::DeviceToHost: - checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost)); - break; - case GpuDirection::DeviceToDevice: - checkCUDA(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice)); - break; - } + checkCUDA( + cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyDeviceToDevice)); } } +template GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool on_host = false); + Allocator &allocator) { + assert(shape.data_type == DataType::FLOAT || + shape.data_type == DataType::DOUBLE); + using T = real_type
; + + GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); + accessor.on_device = + (allocator.alloc_location == AllocLocation::DEVICE) ? true : false; + + std::vector host_data(accessor.shape.num_elements()); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dist(-1.0, 1.0); + + for (auto &val : host_data) { + val = dist(gen); + } + + transfer_memory(accessor, host_data.data(), AllocLocation::HOST); + + return accessor; +} + +template +GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW accessor = + create_random_filled_accessor_w
(shape, allocator); + + return read_only_accessor_from_write_accessor(accessor); +} template GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, Allocator &allocator, - DT val, - bool on_host = false) { + DT val) { GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements(); + accessor.on_device = + (allocator.alloc_location == AllocLocation::DEVICE) ? true : false; + + size_t volume = accessor.shape.get_volume(); std::vector
<DT> host_data(volume, val); - transfer_memory(static_cast<DT *>
(accessor.ptr), - host_data.data(), - volume, - GpuDirection::HostToDevice, - on_host); + transfer_memory(accessor, host_data.data(), AllocLocation::HOST); return accessor; } @@ -64,9 +83,11 @@ GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, template GenericTensorAccessorW create_transformed_accessor_w(TensorShape const &shape, Allocator &allocator, - F transform, - bool on_host = false) { + F transform) { GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); + accessor.on_device = + (allocator.alloc_location == AllocLocation::DEVICE) ? true : false; + size_t volume = accessor.shape.get_volume(); std::vector input_data(volume); std::vector output_data(volume); @@ -74,11 +95,7 @@ GenericTensorAccessorW create_transformed_accessor_w(TensorShape const &shape, std::transform( input_data.begin(), input_data.end(), output_data.begin(), transform); - transfer_memory(static_cast(accessor.ptr), - output_data.data(), - volume, - GpuDirection::HostToDevice, - on_host); + transfer_memory(accessor, output_data.data(), AllocLocation::HOST); return accessor; } @@ -86,42 +103,59 @@ GenericTensorAccessorW create_transformed_accessor_w(TensorShape const &shape, template GenericTensorAccessorW copy_tensor_between_memories(GenericTensorAccessorR accessor, - TensorShape const &shape, - Allocator &allocator, - bool src_on_host = false) { + Allocator &allocator) { + TensorShape shape = get_tensor_shape(accessor.shape, accessor.data_type); GenericTensorAccessorW copied_accessor = allocator.allocate_tensor(shape); + copied_accessor.on_device = + (allocator.alloc_location == AllocLocation::DEVICE) ? true : false; - size_t volume = accessor.shape.get_volume(); - GpuDirection gpu_dir = - src_on_host ? GpuDirection::HostToDevice : GpuDirection::DeviceToHost; + AllocLocation src_loc = + accessor.on_device ? AllocLocation::DEVICE : AllocLocation::HOST; - transfer_memory( - copied_accessor.get
<DT>(), accessor.get<DT>
(), volume, gpu_dir, false); + transfer_memory(copied_accessor, accessor.get<DT>
(), src_loc); return copied_accessor; } -template -TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DT, - }; -} +TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims, + DataType DT); template -std::vector> load_accessor_data(GenericTensorAccessorR accessor, - bool on_host = false) { +std::vector> load_accessor_data(GenericTensorAccessorR accessor) { + using T = real_type
<DT>; int volume = accessor.shape.get_volume(); std::vector<T> local_data(volume); T const *src_ptr = accessor.get<DT>
(); + if (accessor.on_device) { + checkCUDA(cudaMemcpy(local_data.data(), + src_ptr, + volume * sizeof(T), + cudaMemcpyDeviceToHost)); + } else { + memcpy(local_data.data(), src_ptr, volume * sizeof(T)); + } + + return local_data; +} + +template +std::vector> load_accessor_data(GenericTensorAccessorW accessor) { using T = real_type
<DT>; + + int volume = accessor.shape.get_volume(); + std::vector<T> local_data(volume); + T const *src_ptr = accessor.get<DT>
(); - transfer_memory( - local_data.data(), src_ptr, volume, GpuDirection::DeviceToHost, on_host); + if (accessor.on_device) { + checkCUDA(cudaMemcpy(local_data.data(), + src_ptr, + volume * sizeof(T), + cudaMemcpyDeviceToHost)); + } else { + memcpy(local_data.data(), src_ptr, volume * sizeof(T)); + } return local_data; } diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index ae7bd076ce..56d3b5550f 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -12,6 +12,7 @@ struct TrackedAllocator : public IAllocator { ~TrackedAllocator() = default; void *allocate(size_t) override; + void *allocate_and_zero(size_t) override; void deallocate(void *) override; size_t get_current_mem_usage(); diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index 68636906c3..18546ad54b 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -11,6 +11,12 @@ void *TrackedAllocator::allocate(size_t requested_memory_size) { return ptr; } +void *TrackedAllocator::allocate_and_zero(size_t requested_memory_size) { + void *ptr = this->allocator.allocate_and_zero(requested_memory_size); + this->current_mem_usage += requested_memory_size; + return ptr; +} + void TrackedAllocator::deallocate(void *ptr) { size_t psize; checkCUDA(cudaGetSymbolSize(&psize, ptr)); @@ -23,7 +29,9 @@ size_t TrackedAllocator::get_current_mem_usage() { } Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { - return Allocator::create(base_allocator); + Allocator allocator = Allocator::create(base_allocator); + allocator.alloc_location = base_allocator.alloc_location; + return allocator; } } // namespace FlexFlow From c9c33fd6a7f070bdcf840da48f79428438a69c77 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 8 Oct 2024 00:18:45 -0700 Subject: [PATCH 07/20] cpu_kernel's refactor, generic tensor accessor indexing --- lib/kernels/CMakeLists.txt | 1 + lib/kernels/include/kernels/accessor.h | 112 ++++++++++++--- lib/kernels/include/kernels/allocation.h | 12 +- .../include/kernels/attention_kernels.h | 6 +- .../include/kernels/batch_matmul_kernels.h | 8 +- .../include/kernels/batch_norm_kernels.h | 6 +- lib/kernels/include/kernels/cast_kernels.h | 8 +- .../include/kernels/cast_kernels_cpu.h | 8 +- lib/kernels/include/kernels/combine_kernels.h | 8 +- .../include/kernels/combine_kernels_cpu.h | 8 +- lib/kernels/include/kernels/concat_kernels.h | 8 +- lib/kernels/include/kernels/conv_2d_kernels.h | 6 +- .../include/kernels/datatype_dispatch.h | 3 +- lib/kernels/include/kernels/dropout_kernels.h | 6 +- .../include/kernels/element_binary_kernels.h | 6 +- .../include/kernels/element_unary_kernels.h | 6 +- .../include/kernels/embedding_kernels.h | 8 +- lib/kernels/include/kernels/flat_kernels.h | 8 +- lib/kernels/include/kernels/gather_kernels.h | 6 +- .../include/kernels/layer_norm_kernels.h | 6 +- lib/kernels/include/kernels/linear_kernels.h | 6 +- .../include/kernels/local_cpu_allocator.h | 7 +- .../include/kernels/local_cuda_allocator.h | 3 +- lib/kernels/include/kernels/nccl.h | 8 +- .../include/kernels/partition_kernels.h | 6 +- lib/kernels/include/kernels/pool_2d_kernels.h | 6 +- lib/kernels/include/kernels/reduce_kernels.h | 6 +- .../include/kernels/reduction_kernels.h | 8 +- .../include/kernels/replicate_kernels.h | 8 +- 
.../include/kernels/replicate_kernels_cpu.h | 8 +- lib/kernels/include/kernels/reshape_kernels.h | 6 +- lib/kernels/include/kernels/reverse_kernels.h | 8 +- .../include/kernels/reverse_kernels_cpu.h | 23 ++-- lib/kernels/include/kernels/softmax_kernels.h | 6 +- lib/kernels/include/kernels/split_kernels.h | 9 +- lib/kernels/include/kernels/topk_kernels.h | 6 +- .../include/kernels/transpose_kernels.h | 6 +- lib/kernels/src/accessor.cc | 118 +++++++++++++++- lib/kernels/src/allocation.cc | 19 +-- lib/kernels/src/array_shape.cc | 1 + lib/kernels/src/cpu/cast_kernels.cc | 8 +- lib/kernels/src/cpu/combine_kernels.cc | 8 +- lib/kernels/src/cpu/replicate_kernels.cc | 21 ++- lib/kernels/src/cpu/reverse_kernels.cc | 101 ++++++-------- lib/kernels/src/cuda/ops/concat_kernels.cu | 8 +- lib/kernels/src/local_cpu_allocator.cc | 31 +---- lib/kernels/src/local_cuda_allocator.cc | 13 +- lib/kernels/test/CMakeLists.txt | 1 + lib/kernels/test/src/test_cast_kernel.cc | 27 ++-- lib/kernels/test/src/test_combine_kernel.cc | 30 ++-- lib/kernels/test/src/test_concat_kernel.cc | 9 +- lib/kernels/test/src/test_dropout.cc | 2 +- lib/kernels/test/src/test_flat_kernel.cc | 5 +- lib/kernels/test/src/test_partition_kernel.cc | 6 +- lib/kernels/test/src/test_reduction_kernel.cc | 2 +- lib/kernels/test/src/test_replicate_kernel.cc | 53 ++++--- lib/kernels/test/src/test_reshape_kernel.cc | 5 +- lib/kernels/test/src/test_reverse_kernels.cc | 53 ++++--- lib/kernels/test/src/test_softmax_kernel.cc | 3 +- lib/kernels/test/src/test_split_kernel.cc | 1 + lib/kernels/test/src/test_utils.cc | 83 +++++++++-- lib/kernels/test/src/test_utils.h | 129 +++++++++--------- .../local-execution/local_cpu_allocator.h | 2 + .../local-execution/tracked_allocator.h | 4 +- .../src/local_cpu_allocator.cc | 4 + .../src/local_task_argument_accessor.cc | 11 +- lib/local-execution/src/tracked_allocator.cc | 11 +- lib/pcg/src/strided_rectangle.cc | 36 ----- 68 files changed, 619 insertions(+), 556 deletions(-) delete mode 100644 lib/pcg/src/strided_rectangle.cc diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index 8ccd7c1011..fc91b7d3db 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -30,6 +30,7 @@ target_link_libraries( cudnn nccl utils + pcg ) define_ff_vars(${project_target}) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index b7a6b6a0fe..264ada2ad9 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -5,6 +5,7 @@ #include "device.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" +#include "pcg/device_type.dtg.h" #include "utils/exception.h" #include "utils/required.h" #include "utils/variant.h" @@ -29,20 +30,65 @@ class GenericTensorAccessorW { double *get_double_ptr() const; half *get_half_ptr() const; - GenericTensorAccessorW(DataType dt, - ArrayShape sh, - req p, - bool on_dev = true) - : data_type(dt), shape(sh), ptr(p), on_device(on_dev) {} + GenericTensorAccessorW() = delete; + + GenericTensorAccessorW(DataType data_type, ArrayShape const &shape, void *ptr, DeviceType device_type); + + bool operator==(GenericTensorAccessorW const &) const; + bool operator!=(GenericTensorAccessorW const &) const; + + template + real_type_t
&at(Indices... indices) { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { + throw mk_runtime_error( + "Invalid access data type ({} != {})", this->data_type, DT); + } + + using T = real_type_t
; + + T *data_ptr = static_cast(this->ptr); + size_t offset = calculate_index_offset({static_cast(indices)...}); + + return data_ptr[offset]; + } + + template + real_type_t
const &at(Indices... indices) const { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { + throw mk_runtime_error( + "Invalid access data type ({} != {})", this->data_type, DT); + } + + using T = real_type_t
; + + T const *data_ptr = static_cast(this->ptr); + size_t offset = calculate_index_offset({static_cast(indices)...}); + + return data_ptr[offset]; + } public: DataType data_type; ArrayShape shape; - req ptr; - bool on_device; + void *ptr; + DeviceType device_type; + +private: + std::tuple + tie() const; + + size_t calculate_index_offset( + std::initializer_list const &indices) const; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( - GenericTensorAccessorW, data_type, shape, ptr, on_device); std::string format_as(GenericTensorAccessorW const &); std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); @@ -65,20 +111,50 @@ class GenericTensorAccessorR { double const *get_double_ptr() const; half const *get_half_ptr() const; - GenericTensorAccessorR(DataType dt, - ArrayShape sh, - req p, - bool on_dev = true) - : data_type(dt), shape(sh), ptr(p), on_device(on_dev) {} + GenericTensorAccessorR() = delete; + + GenericTensorAccessorR(DataType data_type, + ArrayShape const &shape, + void const *ptr, + DeviceType device_type); + + bool operator==(GenericTensorAccessorR const &) const; + bool operator!=(GenericTensorAccessorR const &) const; + + template + real_type_t
const &at(Indices... indices) const { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { + throw mk_runtime_error( + "Invalid access data type ({} != {})", this->data_type, DT); + } + + using T = real_type_t
; + + T const *data_ptr = static_cast(this->ptr); + size_t offset = calculate_index_offset({static_cast(indices)...}); + + return data_ptr[offset]; + } public: DataType data_type; ArrayShape shape; - req ptr; - bool on_device; + void const *ptr; + DeviceType device_type; + +private: + std::tuple + tie() const; + + size_t calculate_index_offset( + std::initializer_list const &indices) const; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( - GenericTensorAccessorR, data_type, shape, ptr, on_device); std::string format_as(GenericTensorAccessorR const &); std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 452ccc47b0..893be513ea 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -5,15 +5,14 @@ #include #include -enum class AllocLocation { HOST, DEVICE }; - namespace FlexFlow { struct IAllocator { virtual void *allocate(size_t) = 0; - virtual void *allocate_and_zero(size_t) = 0; virtual void deallocate(void *) = 0; + virtual DeviceType get_allocation_device_type() const = 0; + virtual ~IAllocator() = default; }; @@ -21,13 +20,12 @@ struct Allocator { Allocator() = delete; GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape); - GenericTensorAccessorW - allocate_tensor_and_zero(TensorShape const &tensor_shape); void *allocate(size_t mem_size); - void *allocate_and_zero(size_t mem_size); void deallocate(void *ptr); + DeviceType get_allocation_device_type() const; + template static typename std::enable_if::value, Allocator>::type @@ -37,8 +35,6 @@ struct Allocator { Allocator(std::shared_ptr ptr) : i_allocator(ptr){}; - AllocLocation alloc_location; - private: std::shared_ptr i_allocator; }; diff --git a/lib/kernels/include/kernels/attention_kernels.h b/lib/kernels/include/kernels/attention_kernels.h index eb5a1b8198..1e483102dd 100644 --- a/lib/kernels/include/kernels/attention_kernels.h +++ b/lib/kernels/include/kernels/attention_kernels.h @@ -64,8 +64,7 @@ FF_VISITABLE_STRUCT_NO_EQ(MHAPerDeviceState, std::string format_as(MHAPerDeviceState const &x); std::ostream &operator<<(std::ostream &s, MHAPerDeviceState const &x); -namespace Kernels { -namespace MultiHeadAttention { +namespace Kernels::MultiHeadAttention { MHAPerDeviceState init_kernel(PerDeviceFFHandle const &, Allocator &, @@ -105,8 +104,7 @@ void backward_kernel(ffStream_t stream, void cleanup_kernel(Allocator &allocator, MHAPerDeviceState const &device_state); -} // namespace MultiHeadAttention -} // namespace Kernels +} // namespace Kernels::MultiHeadAttention } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/batch_matmul_kernels.h b/lib/kernels/include/kernels/batch_matmul_kernels.h index bfd72647b0..bde91bea15 100644 --- a/lib/kernels/include/kernels/batch_matmul_kernels.h +++ b/lib/kernels/include/kernels/batch_matmul_kernels.h @@ -5,9 +5,7 @@ #include "kernels/allocation.h" #include "kernels/ff_handle.h" -namespace FlexFlow { -namespace Kernels { -namespace BatchMatmul { +namespace FlexFlow::Kernels::BatchMatmul { void forward_kernel(ffStream_t stream, PerDeviceFFHandle const &handle, @@ -35,8 +33,6 @@ void backward_kernel(ffStream_t stream, int k, int batch); -} // namespace BatchMatmul -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::BatchMatmul #endif diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index 
7d533d672c..4de6ac6af0 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -43,8 +43,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(BatchNormPerDeviceState, output_w, relu); -namespace Kernels { -namespace BatchNorm { +namespace Kernels::BatchNorm { BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, Allocator allocator, @@ -81,8 +80,7 @@ void cleanup_kernel(Allocator allocator, bool relu, float *runningMean); -} // namespace BatchNorm -} // namespace Kernels +} // namespace Kernels::BatchNorm } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index 502a823ca7..f67613cec6 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Cast { +namespace FlexFlow::Kernels::Cast { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -20,8 +18,6 @@ void backward_kernel(ffStream_t stream, DataType input_type, DataType output_type); -} // namespace Cast -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Cast #endif diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h index cae0c9da8d..959617dcae 100644 --- a/lib/kernels/include/kernels/cast_kernels_cpu.h +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Cast { +namespace FlexFlow::Kernels::Cast { void cpu_forward_kernel(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, @@ -18,8 +16,6 @@ void cpu_backward_kernel(GenericTensorAccessorR const &input, DataType input_type, DataType output_type); -} // namespace Cast -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Cast #endif diff --git a/lib/kernels/include/kernels/combine_kernels.h b/lib/kernels/include/kernels/combine_kernels.h index eb263e0734..50de18e823 100644 --- a/lib/kernels/include/kernels/combine_kernels.h +++ b/lib/kernels/include/kernels/combine_kernels.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Combine { +namespace FlexFlow::Kernels::Combine { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -16,8 +14,6 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad); -} // namespace Combine -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Combine #endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h index 66c22ddbf8..430c7cf906 100644 --- a/lib/kernels/include/kernels/combine_kernels_cpu.h +++ b/lib/kernels/include/kernels/combine_kernels_cpu.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Combine { +namespace FlexFlow::Kernels::Combine { void cpu_forward_kernel(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); @@ -14,8 +12,6 @@ void cpu_forward_kernel(GenericTensorAccessorR const &input, void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, 
GenericTensorAccessorW const &input_grad); -} // namespace Combine -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Combine #endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/concat_kernels.h b/lib/kernels/include/kernels/concat_kernels.h index a44affc1f2..33355296dd 100644 --- a/lib/kernels/include/kernels/concat_kernels.h +++ b/lib/kernels/include/kernels/concat_kernels.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Concat { +namespace FlexFlow::Kernels::Concat { void forward_kernel(ffStream_t stream, GenericTensorAccessorW const &output, @@ -18,8 +16,6 @@ void backward_kernel(ffStream_t stream, std::vector const &input_grads, ff_dim_t axis); -} // namespace Concat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Concat #endif diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index cfc64f963d..217751e191 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -34,8 +34,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Conv2DPerDeviceState, bwdFilterAlgo, bwdDataAlgo); -namespace Kernels { -namespace Conv2D { +namespace Kernels::Conv2D { Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, std::optional activation, @@ -70,8 +69,7 @@ void backward_kernel(ffStream_t stream, float *bias_grad_ptr, std::optional activation); -} // namespace Conv2D -} // namespace Kernels +} // namespace Kernels::Conv2D } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/datatype_dispatch.h b/lib/kernels/include/kernels/datatype_dispatch.h index e6ab9fa8cc..cda38c2e9e 100644 --- a/lib/kernels/include/kernels/datatype_dispatch.h +++ b/lib/kernels/include/kernels/datatype_dispatch.h @@ -1,7 +1,8 @@ #ifndef _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H #define _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H -#include "accessor.h" +#include "op-attrs/datatype.h" +#include "utils/exception.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/dropout_kernels.h b/lib/kernels/include/kernels/dropout_kernels.h index c0e503be5b..4790540098 100644 --- a/lib/kernels/include/kernels/dropout_kernels.h +++ b/lib/kernels/include/kernels/dropout_kernels.h @@ -31,8 +31,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(DropoutPerDeviceState, reserveSpaceSize, dropoutStateSize); -namespace Kernels { -namespace Dropout { +namespace Kernels::Dropout { DropoutPerDeviceState init_kernel(PerDeviceFFHandle handle, float rate, @@ -56,8 +55,7 @@ void cleanup_kernel(Allocator allocator, ffDropoutDescriptor_t dropoutDesc, void *dropoutStates); -} // namespace Dropout -} // namespace Kernels +} // namespace Kernels::Dropout } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H diff --git a/lib/kernels/include/kernels/element_binary_kernels.h b/lib/kernels/include/kernels/element_binary_kernels.h index 41447e98e6..1017230fb0 100644 --- a/lib/kernels/include/kernels/element_binary_kernels.h +++ b/lib/kernels/include/kernels/element_binary_kernels.h @@ -26,8 +26,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(ElementBinaryPerDeviceState, opDesc, reduceAddDesc); -namespace Kernels { -namespace ElementBinary { +namespace Kernels::ElementBinary { ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle, OperatorType op_type, @@ -58,8 +57,7 @@ void 
backward_kernel(ffStream_t stream, bool broadcast_inputRHS, PerDeviceFFHandle handle); -} // namespace ElementBinary -} // namespace Kernels +} // namespace Kernels::ElementBinary } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h index 8c6864b2d9..26ce4ecaec 100644 --- a/lib/kernels/include/kernels/element_unary_kernels.h +++ b/lib/kernels/include/kernels/element_unary_kernels.h @@ -19,8 +19,7 @@ FF_VISITABLE_STRUCT_NO_EQ(ElementUnaryPerDeviceState, outputTensor, actiDesc); -namespace Kernels { -namespace ElementUnary { +namespace Kernels::ElementUnary { ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, ArrayShape const &output_shape, @@ -42,8 +41,7 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &output, GenericTensorAccessorR const &output_grad); -} // namespace ElementUnary -} // namespace Kernels +} // namespace Kernels::ElementUnary } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/embedding_kernels.h b/lib/kernels/include/kernels/embedding_kernels.h index 06582ca1d5..6d5141f489 100644 --- a/lib/kernels/include/kernels/embedding_kernels.h +++ b/lib/kernels/include/kernels/embedding_kernels.h @@ -5,9 +5,7 @@ #include "kernels/accessor.h" #include "op-attrs/ops/embedding.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, @@ -35,8 +33,6 @@ void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p); template __global__ void rand_generate_int(TD *ptr, size_t size, TD p); -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding #endif // _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H diff --git a/lib/kernels/include/kernels/flat_kernels.h b/lib/kernels/include/kernels/flat_kernels.h index 3e600c48de..41b411c937 100644 --- a/lib/kernels/include/kernels/flat_kernels.h +++ b/lib/kernels/include/kernels/flat_kernels.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Flat { +namespace FlexFlow::Kernels::Flat { void forward_kernel(ffStream_t stream, GenericTensorAccessorR input, @@ -16,8 +14,6 @@ void backward_kernel(ffStream_t stream, float *input_grad_ptr, float const *output_grad_ptr); -} // namespace Flat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Flat #endif // _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H diff --git a/lib/kernels/include/kernels/gather_kernels.h b/lib/kernels/include/kernels/gather_kernels.h index 13bf4b898a..af2da3b11f 100644 --- a/lib/kernels/include/kernels/gather_kernels.h +++ b/lib/kernels/include/kernels/gather_kernels.h @@ -15,8 +15,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState, handle, legion_dim); -namespace Kernels { -namespace Gather { +namespace Kernels::Gather { void forward_kernel(ffStream_t stream, GatherPerDeviceState const &m, @@ -30,8 +29,7 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &index, GenericTensorAccessorW const &input_grad); -} // namespace Gather -} // namespace Kernels +} // namespace Kernels::Gather } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/layer_norm_kernels.h b/lib/kernels/include/kernels/layer_norm_kernels.h index be13d32879..a6ae87442a 100644 --- 
a/lib/kernels/include/kernels/layer_norm_kernels.h +++ b/lib/kernels/include/kernels/layer_norm_kernels.h @@ -30,8 +30,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LayerNormPerDeviceState, bias, data_type); -namespace Kernels { -namespace LayerNorm { +namespace Kernels::LayerNorm { // todo: this may have some problem. LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle, @@ -57,8 +56,7 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorW const &gamma_grad, GenericTensorAccessorW const &beta_grad); -} // namespace LayerNorm -} // namespace Kernels +} // namespace Kernels::LayerNorm } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h index c761eaf1d9..689a77dd2f 100644 --- a/lib/kernels/include/kernels/linear_kernels.h +++ b/lib/kernels/include/kernels/linear_kernels.h @@ -33,8 +33,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LinearPerDeviceState, weight_type, output_type); -namespace Kernels { -namespace Linear { +namespace Kernels::Linear { LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, float *one_ptr, @@ -72,8 +71,7 @@ void backward_kernel(ffStream_t stream, int out_dim, int batch_size); -} // namespace Linear -} // namespace Kernels +} // namespace Kernels::Linear } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h index 121ed184e9..cf6cfe35d1 100644 --- a/lib/kernels/include/kernels/local_cpu_allocator.h +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -7,14 +7,15 @@ struct LocalCPUAllocator : public IAllocator { LocalCPUAllocator() = default; LocalCPUAllocator(LocalCPUAllocator const &) = delete; LocalCPUAllocator(LocalCPUAllocator &&) = delete; - ~LocalCPUAllocator() override; + ~LocalCPUAllocator() = default; void *allocate(size_t) override; - void *allocate_and_zero(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: - std::unordered_set ptrs; + std::unordered_map> ptrs; }; CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCPUAllocator); diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h index 16f60daead..b8e0540974 100644 --- a/lib/kernels/include/kernels/local_cuda_allocator.h +++ b/lib/kernels/include/kernels/local_cuda_allocator.h @@ -10,9 +10,10 @@ struct LocalCudaAllocator : public IAllocator { ~LocalCudaAllocator() override; void *allocate(size_t) override; - void *allocate_and_zero(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_set ptrs; }; diff --git a/lib/kernels/include/kernels/nccl.h b/lib/kernels/include/kernels/nccl.h index b8a6784676..042911d172 100644 --- a/lib/kernels/include/kernels/nccl.h +++ b/lib/kernels/include/kernels/nccl.h @@ -23,15 +23,11 @@ struct ncclUniqueId {}; struct ncclComm_t {}; #endif -namespace FlexFlow { -namespace Kernels { -namespace NCCL { +namespace FlexFlow::Kernels::NCCL { ncclUniqueId generate_unique_id(); ncclComm_t create_comm(ncclUniqueId const &, int num_ranks, int my_rank); -} // namespace NCCL -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::NCCL #endif diff --git a/lib/kernels/include/kernels/partition_kernels.h b/lib/kernels/include/kernels/partition_kernels.h index 64ef1a1352..e580c4a9de 100644 --- 
a/lib/kernels/include/kernels/partition_kernels.h +++ b/lib/kernels/include/kernels/partition_kernels.h @@ -13,8 +13,7 @@ struct RepartitionPerDeviceState { FF_VISITABLE_STRUCT_NO_EQ(RepartitionPerDeviceState, handle, data_type); -namespace Kernels { -namespace Repartition { +namespace Kernels::Repartition { RepartitionPerDeviceState init_kernel(PerDeviceFFHandle const &handle, DataType data_type); @@ -29,8 +28,7 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorW const &output_grad, GenericTensorAccessorR const &input_grad); -} // namespace Repartition -} // namespace Kernels +} // namespace Kernels::Repartition } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index 798c0507f8..191c23bc98 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Pool2DPerDeviceState, poolDesc, relu); -namespace Kernels { -namespace Pool2D { +namespace Kernels::Pool2D { Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle, std::optional activation, @@ -75,8 +74,7 @@ void backward_kernel(ffStream_t stream, void const *output_ptr, void const *output_grad_ptr); -} // namespace Pool2D -} // namespace Kernels +} // namespace Kernels::Pool2D } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h index 4287472875..cd3930ea1c 100644 --- a/lib/kernels/include/kernels/reduce_kernels.h +++ b/lib/kernels/include/kernels/reduce_kernels.h @@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT(ReducePerDeviceState, op_type, reduction_size); -namespace Kernels { -namespace Reduce { +namespace Kernels::Reduce { ReducePerDeviceState init_kernel(PerDeviceFFHandle const &, OperatorType const &, @@ -43,8 +42,7 @@ void backward_kernel(ffStream_t stream, ReducePerDeviceState const &m, float const *output_grad_ptr, float *input_grad_ptr); -} // namespace Reduce -} // namespace Kernels +} // namespace Kernels::Reduce } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H diff --git a/lib/kernels/include/kernels/reduction_kernels.h b/lib/kernels/include/kernels/reduction_kernels.h index fb3baf215c..7e1e240ea4 100644 --- a/lib/kernels/include/kernels/reduction_kernels.h +++ b/lib/kernels/include/kernels/reduction_kernels.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Reduction { +namespace FlexFlow::Kernels::Reduction { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -17,8 +15,6 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output); -} // namespace Reduction -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reduction #endif // _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels.h b/lib/kernels/include/kernels/replicate_kernels.h index 409fc81f44..877eeabf04 100644 --- a/lib/kernels/include/kernels/replicate_kernels.h +++ b/lib/kernels/include/kernels/replicate_kernels.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Replicate { +namespace FlexFlow::Kernels::Replicate { void forward_kernel(ffStream_t stream, 
GenericTensorAccessorR const &input, @@ -17,8 +15,6 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &output, size_t num_replicas); -} // namespace Replicate -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Replicate #endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h index 11d2f1bf5c..a72b799875 100644 --- a/lib/kernels/include/kernels/replicate_kernels_cpu.h +++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h @@ -4,9 +4,7 @@ #include "device.h" #include "kernels/accessor.h" -namespace FlexFlow { -namespace Kernels { -namespace Replicate { +namespace FlexFlow::Kernels::Replicate { void cpu_forward_kernel(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); @@ -15,8 +13,6 @@ void cpu_backward_kernel(GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, size_t num_replicas); -} // namespace Replicate -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Replicate #endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index a83caa6bea..5fa4382c43 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -13,8 +13,7 @@ struct ReshapePerDeviceState { FF_VISITABLE_STRUCT(ReshapePerDeviceState, data_type); -namespace Kernels { -namespace Reshape { +namespace Kernels::Reshape { ReshapePerDeviceState init_kernel(DataType data_type); @@ -28,8 +27,7 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output); -} // namespace Reshape -} // namespace Kernels +} // namespace Kernels::Reshape } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H diff --git a/lib/kernels/include/kernels/reverse_kernels.h b/lib/kernels/include/kernels/reverse_kernels.h index 42a83ae219..deb5b22155 100644 --- a/lib/kernels/include/kernels/reverse_kernels.h +++ b/lib/kernels/include/kernels/reverse_kernels.h @@ -3,9 +3,7 @@ #include "device.h" -namespace FlexFlow { -namespace Kernels { -namespace Reverse { +namespace FlexFlow::Kernels::Reverse { void forward_kernel(ffStream_t stream, float const *in_ptr, @@ -23,8 +21,6 @@ void backward_kernel(ffStream_t stream, coord_t in_blk_size, coord_t input_size); -} // namespace Reverse -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reverse #endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h index bb17aa9400..b0edaa264c 100644 --- a/lib/kernels/include/kernels/reverse_kernels_cpu.h +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -1,27 +1,22 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H #define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H +#include "accessor.h" #include "device.h" -namespace FlexFlow { -namespace Kernels { -namespace Reverse { +namespace FlexFlow::Kernels::Reverse { -void cpu_forward_kernel(float const *in_ptr, - float *out_ptr, +void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, coord_t num_out_blks, coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t output_size); + coord_t in_blk_size); -void cpu_backward_kernel(float const *out_grad_ptr, - 
float *in_grad_ptr, +void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, + GenericTensorAccessorW &input_accessor, coord_t num_out_blks, coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t input_size); -} // namespace Reverse -} // namespace Kernels -} // namespace FlexFlow + coord_t in_blk_size); +} // namespace FlexFlow::Kernels::Reverse #endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/softmax_kernels.h b/lib/kernels/include/kernels/softmax_kernels.h index 061230ec52..93135cb648 100644 --- a/lib/kernels/include/kernels/softmax_kernels.h +++ b/lib/kernels/include/kernels/softmax_kernels.h @@ -15,8 +15,7 @@ struct SoftmaxPerDeviceState { FF_VISITABLE_STRUCT(SoftmaxPerDeviceState, handle, inputTensor, dim); -namespace Kernels { -namespace Softmax { +namespace Kernels::Softmax { SoftmaxPerDeviceState init_kernel(PerDeviceFFHandle const &handle, int dim, @@ -35,8 +34,7 @@ void backward_kernel(ffStream_t stream, float const *output_grad_ptr, size_t num_elements); -} // namespace Softmax -} // namespace Kernels +} // namespace Kernels::Softmax } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/split_kernels.h b/lib/kernels/include/kernels/split_kernels.h index 36434d4be8..538b9602c2 100644 --- a/lib/kernels/include/kernels/split_kernels.h +++ b/lib/kernels/include/kernels/split_kernels.h @@ -3,10 +3,7 @@ #include "device.h" -namespace FlexFlow { - -namespace Kernels { -namespace Split { +namespace FlexFlow::Kernels::Split { void forward_kernel(ffStream_t stream, float **out_ptrs, float const *in_ptr, @@ -22,8 +19,6 @@ void backward_kernel(ffStream_t stream, coord_t num_blks, int numOutputs); -} // namespace Split -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Split #endif // _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H diff --git a/lib/kernels/include/kernels/topk_kernels.h b/lib/kernels/include/kernels/topk_kernels.h index ae1c739f6c..6f33381e1a 100644 --- a/lib/kernels/include/kernels/topk_kernels.h +++ b/lib/kernels/include/kernels/topk_kernels.h @@ -12,8 +12,7 @@ struct TopKPerDeviceState { FF_VISITABLE_STRUCT(TopKPerDeviceState, sorted); -namespace Kernels { -namespace TopK { +namespace Kernels::TopK { TopKPerDeviceState init_kernel(bool sorted); @@ -35,8 +34,7 @@ void backward_kernel(ffStream_t stream, int length, int k); -} // namespace TopK -} // namespace Kernels +} // namespace Kernels::TopK } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H diff --git a/lib/kernels/include/kernels/transpose_kernels.h b/lib/kernels/include/kernels/transpose_kernels.h index 56da81ba2b..b48b7e0aa8 100644 --- a/lib/kernels/include/kernels/transpose_kernels.h +++ b/lib/kernels/include/kernels/transpose_kernels.h @@ -16,8 +16,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(TransposePerDeviceState, num_dim, perm); -namespace Kernels { -namespace Transpose { +namespace Kernels::Transpose { TransposePerDeviceState init_kernel(int num_dim, std::vector const &perm); @@ -32,8 +31,7 @@ void backward_kernel(cudaStream_t stream, GenericTensorAccessorW const &in_grad, GenericTensorAccessorR const &out_grad); -} // namespace Transpose -} // namespace Kernels +} // namespace Kernels::Transpose } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index 66d3c02300..c0b11a2299 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -2,6 +2,64 @@ namespace FlexFlow 
{ +GenericTensorAccessorW::GenericTensorAccessorW( + DataType data_type, + ArrayShape const &shape, + void *ptr, + DeviceType device_type = DeviceType::GPU) + : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {} + +std::tuple + GenericTensorAccessorW::tie() const { + return std::tie(this->data_type, this->shape, this->ptr, this->device_type); +} + +size_t GenericTensorAccessorW::calculate_index_offset( + std::initializer_list const &indices) const { + + if (indices.size() != this->shape.num_dims()) { + throw mk_runtime_error( + "Number of indices ({}) does not match the number of dimensions ({}).", + indices.size(), + this->shape.num_dims()); + } + + size_t offset = 0; + size_t multiplier = 1; + size_t cur_idx; + auto it = indices.end() - 1; + + for (std::size_t i = this->shape.num_dims(); i-- > 0;) { + cur_idx = *it--; + + if (cur_idx >= this->shape[legion_dim_t(i)]) { + throw mk_runtime_error("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + cur_idx, + this->shape[legion_dim_t(i)]); + } + + offset += cur_idx * multiplier; + multiplier *= this->shape[legion_dim_t(i)]; + } + + return offset; +} + +bool GenericTensorAccessorW::operator==( + GenericTensorAccessorW const &other) const { + return this->tie() == other.tie(); +} + +bool GenericTensorAccessorW::operator!=( + GenericTensorAccessorW const &other) const { + return this->tie() != other.tie(); +} + int32_t *GenericTensorAccessorW::get_int32_ptr() const { return this->get(); } @@ -33,6 +91,64 @@ std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) { return (s << fmt::to_string(a)); } +GenericTensorAccessorR::GenericTensorAccessorR( + DataType data_type, + ArrayShape const &shape, + void const *ptr, + DeviceType device_type = DeviceType::GPU) + : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {} + +std::tuple + GenericTensorAccessorR::tie() const { + return std::tie(this->data_type, this->shape, this->ptr, this->device_type); +} + +size_t GenericTensorAccessorR::calculate_index_offset( + std::initializer_list const &indices) const { + + if (indices.size() != this->shape.num_dims()) { + throw mk_runtime_error( + "Number of indices ({}) does not match the number of dimensions ({}).", + indices.size(), + this->shape.num_dims()); + } + + size_t offset = 0; + size_t multiplier = 1; + size_t cur_idx; + auto it = indices.end() - 1; + + for (std::size_t i = this->shape.num_dims(); i-- > 0;) { + cur_idx = *it--; + + if (cur_idx >= this->shape[legion_dim_t(i)]) { + throw mk_runtime_error("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + cur_idx, + this->shape[legion_dim_t(i)]); + } + + offset += cur_idx * multiplier; + multiplier *= this->shape[legion_dim_t(i)]; + } + + return offset; +} + +bool GenericTensorAccessorR::operator==( + GenericTensorAccessorR const &other) const { + return this->tie() == other.tie(); +} + +bool GenericTensorAccessorR::operator!=( + GenericTensorAccessorR const &other) const { + return this->tie() != other.tie(); +} + int32_t const *GenericTensorAccessorR::get_int32_ptr() const { return this->get(); } @@ -159,7 +275,7 @@ GenericTensorAccessorR read_only_accessor_from_write_accessor( return GenericTensorAccessorR{writable.data_type, writable.shape, req(writable.ptr), - writable.on_device}; + writable.device_type}; } bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index 
ce06fbabe0..751cdc0ebb 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -7,26 +7,19 @@ void *Allocator::allocate(size_t mem_size) { return this->i_allocator->allocate(mem_size); } -void *Allocator::allocate_and_zero(size_t mem_size) { - return this->i_allocator->allocate_and_zero(mem_size); -} - void Allocator::deallocate(void *ptr) { this->i_allocator->deallocate(ptr); } -GenericTensorAccessorW - Allocator::allocate_tensor(TensorShape const &tensor_shape) { - void *ptr = this->allocate(get_size_in_bytes(tensor_shape)); - bool on_device = this->alloc_location == AllocLocation::DEVICE; - return {tensor_shape.data_type, tensor_shape, ptr, on_device}; +DeviceType Allocator::get_allocation_device_type() const { + return this->i_allocator->get_allocation_device_type(); } GenericTensorAccessorW - Allocator::allocate_tensor_and_zero(TensorShape const &tensor_shape) { - void *ptr = this->allocate_and_zero(get_size_in_bytes(tensor_shape)); - bool on_device = this->alloc_location == AllocLocation::DEVICE; - return {tensor_shape.data_type, tensor_shape, ptr, on_device}; + Allocator::allocate_tensor(TensorShape const &tensor_shape) { + void *ptr = this->allocate(get_size_in_bytes(tensor_shape)); + return { + tensor_shape.data_type, tensor_shape, ptr, get_allocation_device_type()}; } } // namespace FlexFlow diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index d5e2f1167d..5c18a9ab5a 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -53,6 +53,7 @@ std::size_t ArrayShape::at(ff_dim_t idx) const { ArrayShape ArrayShape::sub_shape( std::optional> start, std::optional> end) const { + NOT_IMPLEMENTED(); } diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc index 5888d9a96a..2d3f440c75 100644 --- a/lib/kernels/src/cpu/cast_kernels.cc +++ b/lib/kernels/src/cpu/cast_kernels.cc @@ -1,9 +1,7 @@ #include "kernels/cast_kernels_cpu.h" #include "kernels/datatype_dispatch.h" -namespace FlexFlow { -namespace Kernels { -namespace Cast { +namespace FlexFlow::Kernels::Cast { template void cpu_cast_forward(IDT const *input, ODT *output, size_t volume) { @@ -53,6 +51,4 @@ void cpu_backward_kernel(GenericTensorAccessorR const &input, input_type, output_type, input, output); } -} // namespace Cast -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/cpu/combine_kernels.cc b/lib/kernels/src/cpu/combine_kernels.cc index e48f4c3e01..d0be1f9f2d 100644 --- a/lib/kernels/src/cpu/combine_kernels.cc +++ b/lib/kernels/src/cpu/combine_kernels.cc @@ -1,9 +1,7 @@ #include "kernels/combine_kernels_cpu.h" #include "kernels/datatype_dispatch.h" -namespace FlexFlow { -namespace Kernels { -namespace Combine { +namespace FlexFlow::Kernels::Combine { template struct CPUForwardKernel { @@ -37,6 +35,4 @@ void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, input_grad.data_type, output_grad, input_grad); } -} // namespace Combine -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Combine diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index 239baf4041..5853869047 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -1,26 +1,22 @@ #include "kernels/datatype_dispatch.h" #include "kernels/replicate_kernels_cpu.h" -namespace FlexFlow { -namespace Kernels { -namespace Replicate { +namespace 
FlexFlow::Kernels::Replicate { template void cpu_replicate_backward_kernel(T *input, T const *output, size_t num_elements, size_t num_replicas) { - for (size_t i = 0; i < num_elements; ++i) { + for (size_t i = 0; i < num_elements; i++) { T sum = 0; - for (size_t j = 0; j < num_replicas; ++j) { + for (size_t j = 0; j < num_replicas; j++) { sum += output[i + j * num_elements]; } input[i] = sum; } } -// Why does replicate forward seem to only transfer memory? Shouldn't it also -// handle the replication? template struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, @@ -36,9 +32,10 @@ struct CPUBackwardKernel { void operator()(GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, size_t num_replicas) { - size_t total_elements = input.shape.num_elements() * num_replicas; - cpu_replicate_backward_kernel( - input.get(), output.get(), total_elements, num_replicas); + cpu_replicate_backward_kernel(input.get(), + output.get(), + input.shape.num_elements(), + num_replicas); } }; @@ -54,6 +51,4 @@ void cpu_backward_kernel(GenericTensorAccessorW const &input, input.data_type, input, output, num_replicas); } -} // namespace Replicate -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index 350dad03e9..1971435d8c 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -2,77 +2,66 @@ #include #include -namespace FlexFlow { -namespace Kernels { -namespace Reverse { +namespace FlexFlow::Kernels::Reverse { -void cpu_reverse_forward_kernel(float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size) { - coord_t total_elements = num_out_blks * reverse_dim_size * in_blk_size; +template +struct CPUReverseForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size) { + assert(input.data_type == DT && output.data_type == DT); - std::vector> in_blocks(num_out_blks * reverse_dim_size, - std::vector(in_blk_size)); - - // For each output block, copy the input block into in_blocks - for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) { - // Each output block has reverse_dim_size input blocks - for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; ++rev_idx) { - coord_t start_idx = (blk_idx * reverse_dim_size + rev_idx) * in_blk_size; - - // Copy elements from in_ptr to the current block in in_blocks - std::vector ¤t_block = - in_blocks[blk_idx * reverse_dim_size + rev_idx]; - for (coord_t i = 0; i < in_blk_size; ++i) { - current_block[i] = in_ptr[start_idx + i]; + // For each output block, copy the input block + for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) { + for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; ++rev_idx) { + for (coord_t i = 0; i < in_blk_size; ++i) { + output.at
(blk_idx, rev_idx, i) =
+                input.at<DT>
(blk_idx, rev_idx, i); + } } } - } - // Reverse the in_blocks within each output block - for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) { - auto block_start = in_blocks.begin() + blk_idx * reverse_dim_size; - auto block_end = block_start + reverse_dim_size; - std::reverse(block_start, block_end); - } - - // Copy the reversed blocks to the output array - for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) { - for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; ++rev_idx) { - coord_t start_idx = (blk_idx * reverse_dim_size + rev_idx) * in_blk_size; + // Reverse the blocks within each output block + for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) { + for (coord_t rev_idx = 0; rev_idx < reverse_dim_size / 2; ++rev_idx) { + coord_t start_idx = rev_idx; + coord_t end_idx = reverse_dim_size - 1 - rev_idx; - // Copy elements from the current block in in_blocks to out_ptr - std::vector const ¤t_block = - in_blocks[blk_idx * reverse_dim_size + rev_idx]; - for (coord_t i = 0; i < in_blk_size; ++i) { - out_ptr[start_idx + i] = current_block[i]; + for (coord_t i = 0; i < in_blk_size; ++i) { + std::swap(output.at
(blk_idx, start_idx, i),
+                      output.at<DT>
(blk_idx, end_idx, i)); + } } } } -} +}; -void cpu_forward_kernel(float const *in_ptr, - float *out_ptr, +void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, coord_t num_out_blks, coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t output_size) { - cpu_reverse_forward_kernel( - in_ptr, out_ptr, num_out_blks, reverse_dim_size, in_blk_size); + coord_t in_blk_size) { + DataTypeDispatch1{}(input_accessor.data_type, + input_accessor, + std::ref(output_accessor), + num_out_blks, + reverse_dim_size, + in_blk_size); } -void cpu_backward_kernel(float const *out_grad_ptr, - float *in_grad_ptr, +void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, + GenericTensorAccessorW &input_accessor, coord_t num_out_blks, coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t input_size) { - cpu_reverse_forward_kernel( - out_grad_ptr, in_grad_ptr, num_out_blks, reverse_dim_size, in_blk_size); + coord_t in_blk_size) { + DataTypeDispatch1{}(output_accessor.data_type, + output_accessor, + std::ref(input_accessor), + num_out_blks, + reverse_dim_size, + in_blk_size); } -} // namespace Reverse -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu index 68004738d2..ad216feda2 100644 --- a/lib/kernels/src/cuda/ops/concat_kernels.cu +++ b/lib/kernels/src/cuda/ops/concat_kernels.cu @@ -17,9 +17,7 @@ #include "kernels/concat_kernels.h" #include -namespace FlexFlow { -namespace Kernels { -namespace Concat { +namespace FlexFlow::Kernels::Concat { void calc_blk_size(size_t &num_blocks, size_t &blk_size, @@ -87,6 +85,4 @@ void backward_kernel(cudaStream_t stream, } } -} // namespace Concat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Concat diff --git a/lib/kernels/src/local_cpu_allocator.cc b/lib/kernels/src/local_cpu_allocator.cc index ced707edcc..5cf337c685 100644 --- a/lib/kernels/src/local_cpu_allocator.cc +++ b/lib/kernels/src/local_cpu_allocator.cc @@ -1,34 +1,16 @@ #include "kernels/local_cpu_allocator.h" #include "kernels/device.h" +#include "utils/containers/contains_key.h" namespace FlexFlow { void *LocalCPUAllocator::allocate(size_t requested_memory_size) { void *ptr = malloc(requested_memory_size); - - if (ptr != nullptr) { - this->ptrs.insert(ptr); - } else { - throw std::bad_alloc(); - } - - return ptr; -} - -void *LocalCPUAllocator::allocate_and_zero(size_t requested_memory_size) { - void *ptr = calloc(1, requested_memory_size); - - if (ptr != nullptr) { - this->ptrs.insert(ptr); - } else { - throw std::bad_alloc(); - } - + this->ptrs.insert({ptr, std::unique_ptr(ptr, free)}); return ptr; } void LocalCPUAllocator::deallocate(void *ptr) { - if (contains(this->ptrs, ptr)) { - free(ptr); + if (contains_key(this->ptrs, ptr)) { this->ptrs.erase(ptr); } else { throw std::runtime_error( @@ -36,15 +18,12 @@ void LocalCPUAllocator::deallocate(void *ptr) { } } -LocalCPUAllocator::~LocalCPUAllocator() { - for (void *ptr : this->ptrs) { - free(ptr); - } +DeviceType LocalCPUAllocator::get_allocation_device_type() const { + return DeviceType::CPU; } Allocator create_local_cpu_memory_allocator() { Allocator allocator = Allocator::create(); - allocator.alloc_location = AllocLocation::HOST; return allocator; } diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/local_cuda_allocator.cc index b6c615a5ca..416768a479 100644 --- 
a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/local_cuda_allocator.cc @@ -10,14 +10,6 @@ void *LocalCudaAllocator::allocate(size_t requested_memory_size) { return ptr; } -void *LocalCudaAllocator::allocate_and_zero(size_t requested_memory_size) { - void *ptr; - checkCUDA(cudaMalloc(&ptr, requested_memory_size)); - checkCUDA(cudaMemset(ptr, 0, requested_memory_size)); - this->ptrs.insert(ptr); - return ptr; -} - void LocalCudaAllocator::deallocate(void *ptr) { if (contains(this->ptrs, ptr)) { checkCUDA(cudaFree(ptr)); @@ -28,6 +20,10 @@ void LocalCudaAllocator::deallocate(void *ptr) { } } +DeviceType LocalCudaAllocator::get_allocation_device_type() const { + return DeviceType::GPU; +} + LocalCudaAllocator::~LocalCudaAllocator() { for (void *ptr : this->ptrs) { checkCUDA(cudaFree(ptr)); @@ -36,7 +32,6 @@ LocalCudaAllocator::~LocalCudaAllocator() { Allocator create_local_cuda_memory_allocator() { Allocator allocator = Allocator::create(); - allocator.alloc_location = AllocLocation::DEVICE; return allocator; } diff --git a/lib/kernels/test/CMakeLists.txt b/lib/kernels/test/CMakeLists.txt index 007740b510..981f87b3d8 100644 --- a/lib/kernels/test/CMakeLists.txt +++ b/lib/kernels/test/CMakeLists.txt @@ -14,4 +14,5 @@ ff_add_test_executable( cudnn cudart cublas + pcg ) diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 8ef2e9cccd..b77e743b62 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -1,7 +1,7 @@ #include "doctest/doctest.h" #include "kernels/cast_kernels.h" +#include "kernels/cast_kernels_cpu.h" #include "test_utils.h" -#include using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { @@ -40,6 +40,7 @@ TEST_SUITE(FF_TEST_SUITE) { allocator); GenericTensorAccessorW grad_input_accessor = allocator.allocate_tensor(input_shape); + fill_with_zeros(grad_input_accessor); Kernels::Cast::backward_kernel(managed_stream.raw_stream(), grad_output_accessor, @@ -71,44 +72,34 @@ TEST_SUITE(FF_TEST_SUITE) { // Only calling forward kernel as backward kernel is exactly the same SUBCASE("forward_kernel") { - auto transform = [start_val = 1.1f, - counter = 0.0f](float input) mutable -> float { - return start_val + counter++; - }; - // Run GPU Forward Kernel GenericTensorAccessorW input_accessor_gpu = - create_transformed_accessor_w( - input_shape, gpu_allocator, transform); + create_random_filled_accessor_w(input_shape, + gpu_allocator); Kernels::Cast::forward_kernel( managed_stream.raw_stream(), read_only_accessor_from_write_accessor(input_accessor_gpu), output_accessor_gpu, DataType::FLOAT, DataType::INT32); - std::cout << "Before GPU load" << std::endl; + std::vector result_data_gpu = load_accessor_data(output_accessor_gpu); // Run CPU Forward Kernel GenericTensorAccessorW input_accessor_cpu = - create_transformed_accessor_w( - input_shape, cpu_allocator, transform); + create_random_filled_accessor_w(input_shape, + cpu_allocator); Kernels::Cast::cpu_forward_kernel( read_only_accessor_from_write_accessor(input_accessor_cpu), output_accessor_cpu, DataType::FLOAT, DataType::INT32); - std::cout << "Before CPU load" << std::endl; - if (output_accessor_cpu.on_device) { - std::cout << "CPU data is on device" << std::endl; - } else { - std::cout << "CPU data is on host" << std::endl; - } + std::vector result_data_cpu = load_accessor_data(output_accessor_cpu); - CHECK(result_data_gpu == result_data_cpu); + CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); } } } diff --git 
a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index aeceb1ef4d..8999a45b06 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -72,20 +72,18 @@ TEST_SUITE(FF_TEST_SUITE) { load_accessor_data(output_accessor_gpu); // Run CPU Combine Forward Kernel - GenericTensorAccessorW input_accessor_cpu = - copy_tensor_between_memories(input_accessor_gpu, - cpu_allocator); + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); GenericTensorAccessorW output_accessor_cpu = cpu_allocator.allocate_tensor(output_shape); - Kernels::Combine::cpu_forward_kernel( - read_only_accessor_from_write_accessor(input_accessor_cpu), - output_accessor_cpu); + Kernels::Combine::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); std::vector result_data_cpu = load_accessor_data(output_accessor_cpu); - CHECK(result_data_gpu == result_data_cpu); + CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); } SUBCASE("backward_kernel") { @@ -94,7 +92,8 @@ TEST_SUITE(FF_TEST_SUITE) { create_random_filled_accessor_r(output_shape, gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor_and_zero(input_shape); + gpu_allocator.allocate_tensor(input_shape); + fill_with_zeros(input_grad_accessor_gpu); Kernels::Combine::backward_kernel(managed_stream.raw_stream(), output_grad_accessor_gpu, @@ -104,20 +103,19 @@ TEST_SUITE(FF_TEST_SUITE) { load_accessor_data(input_grad_accessor_gpu); // Run CPU Combine Backward Kernel - GenericTensorAccessorW output_grad_accessor_cpu = - copy_tensor_between_memories( - output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor_and_zero(input_shape); + cpu_allocator.allocate_tensor(input_shape); + fill_with_zeros(input_grad_accessor_cpu); - Kernels::Combine::cpu_backward_kernel( - read_only_accessor_from_write_accessor(output_grad_accessor_cpu), - input_grad_accessor_cpu); + Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, + input_grad_accessor_cpu); std::vector result_data_cpu = load_accessor_data(input_grad_accessor_cpu); - CHECK(result_data_gpu == result_data_cpu); + CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); } } } diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 8754381850..a2becc3a54 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -1,13 +1,14 @@ #include "doctest/doctest.h" #include "kernels/concat_kernels.h" #include "test_utils.h" +#include "utils/containers/repeat.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { - size_t num_inputs = 3; - size_t size_per_input = 100; - ff_dim_t concat_axis = ff_dim_t(0); + size_t num_inputs = 2; + size_t size_per_input = 10; + ff_dim_t concat_axis = ff_dim_t(1); ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -15,7 +16,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_tensor_shape_from_legion_dims({size_per_input}, DataType::FLOAT); TensorShape output_shape = make_tensor_shape_from_legion_dims( - {size_per_input, num_inputs}, DataType::FLOAT); + {num_inputs, size_per_input}, DataType::FLOAT); Allocator 
allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 8237e61729..f306e2fcbe 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -1,7 +1,7 @@ #include "doctest/doctest.h" #include "kernels/dropout_kernels.h" #include "test_utils.h" -#include "utils/containers.h" +#include "utils/containers/count.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 5c88110fde..c4d6aa666a 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector expected_output_data( input_accessor.shape.num_elements(), 2.0f); - CHECK(check_output_data == expected_output_data); + CHECK(vectors_are_approx_equal(check_output_data, expected_output_data)); } SUBCASE("backward_kernel") { @@ -50,7 +50,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector expected_output_data( input_accessor.shape.num_elements(), 1.0f); - CHECK(backward_output_data == expected_output_data); + CHECK( + vectors_are_approx_equal(backward_output_data, expected_output_data)); } } } diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index d34101d349..21c970e175 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -33,7 +33,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector expected_output_data( input_accessor.shape.num_elements(), 1.0f); - CHECK(check_output_data == expected_output_data); + + CHECK(vectors_are_approx_equal(check_output_data, expected_output_data)); } SUBCASE("backward_kernel") { @@ -53,7 +54,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector expected_grad_input_data( input_grad_accessor.shape.num_elements(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(vectors_are_approx_equal(host_grad_input_data, + expected_grad_input_data)); } } } diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 989ffde163..999044beb8 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -52,7 +52,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor.shape.num_elements(), 1.0f); std::vector host_grad_data = load_accessor_data(input_grad_accessor); - CHECK(host_grad_data == expected_grad_input_data); + CHECK(vectors_are_approx_equal(host_grad_data, expected_grad_input_data)); } } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 315a1c3489..dae69c0262 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -33,7 +33,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector expected_output_data( input_accessor.shape.num_elements(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(vectors_are_approx_equal(check_output_data, expected_output_data)); } SUBCASE("backward_kernel") { @@ -55,17 +55,12 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("Check Replicate Forward Kernel against CPU Kernel") { - std::size_t num_replicas = 10; + std::size_t num_replicas = 2; - // This should be like three shapes: pre_replication, replication shape, and - // reduced shape, but things are weird cause doesn't seem to be replicating - // anything (ie. 
input shape should be same as reduced shape) TensorShape input_shape = - make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT); - TensorShape replicated_shape = - make_tensor_shape_from_legion_dims({10, num_replicas}, DataType::FLOAT); - TensorShape reduced_shape = - make_tensor_shape_from_legion_dims({10}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({5}, DataType::FLOAT); + TensorShape output_shape = + make_tensor_shape_from_legion_dims({5, num_replicas}, DataType::FLOAT); ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -79,7 +74,8 @@ TEST_SUITE(FF_TEST_SUITE) { create_random_filled_accessor_r(input_shape, gpu_allocator); GenericTensorAccessorW output_accessor_gpu = - gpu_allocator.allocate_tensor(replicated_shape); + gpu_allocator.allocate_tensor(output_shape); + fill_with_zeros(output_accessor_gpu); Kernels::Replicate::forward_kernel( managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); @@ -88,29 +84,29 @@ TEST_SUITE(FF_TEST_SUITE) { load_accessor_data(output_accessor_gpu); // Run CPU Replicate Forward Kernel - GenericTensorAccessorW input_accessor_cpu = - copy_tensor_between_memories(input_accessor_gpu, - cpu_allocator); + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); GenericTensorAccessorW output_accessor_cpu = - cpu_allocator.allocate_tensor(replicated_shape); + cpu_allocator.allocate_tensor(output_shape); + fill_with_zeros(output_accessor_cpu); - Kernels::Replicate::cpu_forward_kernel( - read_only_accessor_from_write_accessor(input_accessor_cpu), - output_accessor_cpu); + Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); std::vector result_data_cpu = load_accessor_data(output_accessor_cpu); - CHECK(result_data_gpu == result_data_cpu); + CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); } SUBCASE("backward_kernel") { // Run GPU Replicate Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = - create_random_filled_accessor_r(replicated_shape, + create_random_filled_accessor_r(output_shape, gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor_and_zero(reduced_shape); + gpu_allocator.allocate_tensor(input_shape); + fill_with_zeros(input_grad_accessor_gpu); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), input_grad_accessor_gpu, @@ -121,21 +117,20 @@ TEST_SUITE(FF_TEST_SUITE) { load_accessor_data(input_grad_accessor_gpu); // Run CPU Replicate Backward Kernel - GenericTensorAccessorW output_grad_accessor_cpu = - copy_tensor_between_memories( - output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor_and_zero(reduced_shape); + cpu_allocator.allocate_tensor(input_shape); + fill_with_zeros(input_grad_accessor_cpu); Kernels::Replicate::cpu_backward_kernel( - input_grad_accessor_cpu, - read_only_accessor_from_write_accessor(output_grad_accessor_cpu), - num_replicas); + input_grad_accessor_cpu, output_grad_accessor_cpu, num_replicas); std::vector result_data_cpu = load_accessor_data(input_grad_accessor_cpu); - CHECK(result_data_gpu == result_data_cpu); + CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); } } } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index e8b3d9d2f5..016e7b490a 
100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -32,7 +32,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector expected_output_data( input_accessor.shape.num_elements(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(vectors_are_approx_equal(check_output_data, expected_output_data)); } SUBCASE("backward_kernel") { @@ -52,7 +52,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector expected_grad_input_data( input_grad_accessor.shape.num_elements(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(vectors_are_approx_equal(host_grad_input_data, + expected_grad_input_data)); } } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index be1d946902..94e6f139ff 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -84,11 +84,12 @@ TEST_SUITE(FF_TEST_SUITE) { }; // Run GPU Cast Forward Kernel - GenericTensorAccessorW input_accessor_gpu = - create_transformed_accessor_w( - input_shape, gpu_allocator, transform); + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, + gpu_allocator); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(output_shape); + fill_with_zeros(output_accessor_gpu); Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), input_accessor_gpu.get_float_ptr(), @@ -102,33 +103,32 @@ TEST_SUITE(FF_TEST_SUITE) { load_accessor_data(output_accessor_gpu); // Run CPU Cast Forward Kernel - GenericTensorAccessorW input_accessor_cpu = - create_transformed_accessor_w( - input_shape, cpu_allocator, transform); + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); GenericTensorAccessorW output_accessor_cpu = cpu_allocator.allocate_tensor(output_shape); + fill_with_zeros(output_accessor_cpu); - Kernels::Reverse::cpu_forward_kernel( - input_accessor_cpu.get_float_ptr(), - output_accessor_cpu.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_accessor_cpu.shape.num_elements()); + Kernels::Reverse::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu, + num_out_blks, + reverse_dim_size, + in_blk_size); std::vector result_data_cpu = load_accessor_data(output_accessor_cpu); - CHECK(result_data_gpu == result_data_cpu); + CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); } SUBCASE("backward_kernel") { // Run GPU Cast Backward Kernel - GenericTensorAccessorW output_grad_accessor_gpu = - create_random_filled_accessor_w(output_shape, + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = gpu_allocator.allocate_tensor(input_shape); + fill_with_zeros(input_grad_accessor_gpu); Kernels::Reverse::backward_kernel( managed_stream.raw_stream(), @@ -143,25 +143,22 @@ TEST_SUITE(FF_TEST_SUITE) { load_accessor_data(input_grad_accessor_gpu); // Run CPU Cast Backward Kernel - GenericTensorAccessorW output_grad_accessor_cpu = - copy_tensor_between_memories( - read_only_accessor_from_write_accessor(output_grad_accessor_gpu), - cpu_allocator); + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); GenericTensorAccessorW input_grad_accessor_cpu = cpu_allocator.allocate_tensor(input_shape); + fill_with_zeros(input_grad_accessor_cpu); - Kernels::Reverse::cpu_backward_kernel( - 
output_grad_accessor_cpu.get_float_ptr(), - input_grad_accessor_cpu.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_grad_accessor_cpu.shape.num_elements()); + Kernels::Reverse::cpu_backward_kernel(output_grad_accessor_cpu, + input_grad_accessor_cpu, + num_out_blks, + reverse_dim_size, + in_blk_size); std::vector result_data_cpu = load_accessor_data(input_grad_accessor_cpu); - CHECK(result_data_gpu == result_data_cpu); + CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); } } } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index c25c2f91d3..e4f73d4747 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -55,7 +55,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector(input_grad_accessor.shape.num_elements(), 1.0f); std::vector host_input_grad_data = load_accessor_data(input_grad_accessor); - CHECK(host_input_grad_data == expected_input_grad_data); + CHECK(vectors_are_approx_equal(host_input_grad_data, + expected_input_grad_data)); } } } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index 26acbee33c..649f188e9e 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -1,6 +1,7 @@ #include "doctest/doctest.h" #include "kernels/split_kernels.h" #include "test_utils.h" +#include "utils/containers/repeat.h" using namespace ::FlexFlow; diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index b147523604..02421f9bc5 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -1,19 +1,86 @@ #include "test_utils.h" -GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, - Allocator &allocator) { - GenericTensorAccessorW accessor = - create_random_filled_accessor_w(shape, allocator); +namespace FlexFlow { - return read_only_accessor_from_write_accessor(accessor); +bool device_on_cpu(DeviceType device_type) { + return device_type == DeviceType::CPU; } -TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims, - DataType DT) { +bool device_on_gpu(DeviceType device_type) { + return device_type == DeviceType::GPU; +} + +TensorShape + make_tensor_shape_from_legion_dims(LegionOrdered const &dims, + DataType DT) { return TensorShape{ TensorDims{ - dims, + ff_ordered_from_legion_ordered(dims), }, DT, }; } + +template +struct CopyTensorAccessorW { + GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW copied_tensor = allocator.allocate_tensor(shape); + + transfer_memory( + copied_tensor, src_accessor.get
(), src_accessor.device_type);
+
+    return copied_tensor;
+  }
+};
+
+GenericTensorAccessorW
+    copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor,
+                           Allocator &allocator) {
+  return DataTypeDispatch1<CopyTensorAccessorW>{}(
+      src_accessor.data_type, src_accessor, std::ref(allocator));
+}
+
+template <DataType DT>
+struct CopyTensorAccessorR {
+  GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor,
+                                    Allocator &allocator) {
+    TensorShape shape =
+        get_tensor_shape(src_accessor.shape, src_accessor.data_type);
+    GenericTensorAccessorW copied_tensor = allocator.allocate_tensor(shape);
+
+    transfer_memory(
+        copied_tensor, src_accessor.get<DT>
(), src_accessor.device_type);
+
+    return read_only_accessor_from_write_accessor(copied_tensor);
+  }
+};
+
+GenericTensorAccessorR
+    copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor,
+                           Allocator &allocator) {
+  return DataTypeDispatch1<CopyTensorAccessorR>{}(
+      src_accessor.data_type, src_accessor, std::ref(allocator));
+}
+
+template <DataType DT>
+struct FillWithZeros {
+  void operator()(GenericTensorAccessorW const &accessor) {
+    using T = real_type_t<DT>
; + + if (accessor.device_type == DeviceType::CPU) { + memset(accessor.ptr, 0, accessor.shape.get_volume() * sizeof(T)); + } else { + checkCUDA( + cudaMemset(accessor.ptr, 0, accessor.shape.get_volume() * sizeof(T))); + } + } +}; + +void fill_with_zeros(GenericTensorAccessorW const &accessor) { + DataTypeDispatch1{}(accessor.data_type, accessor); +} + +} // namespace FlexFlow diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index 4426ba2df8..22dda0029a 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -1,28 +1,48 @@ #ifndef _FLEXFLOW_KERNELS_TEST_UTILS #define _FLEXFLOW_KERNELS_TEST_UTILS +#include "kernels/datatype_dispatch.h" #include "kernels/device.h" #include "kernels/local_cpu_allocator.h" #include "kernels/local_cuda_allocator.h" #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" +#include "op-attrs/datatype.h" +#include "utils/containers/all_of.h" #include +namespace FlexFlow { + +bool device_on_cpu(DeviceType); +bool device_on_gpu(DeviceType); + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator); + +TensorShape + make_tensor_shape_from_legion_dims(LegionOrdered const &dims, + DataType DT); + +void fill_with_zeros(GenericTensorAccessorW const &accessor); + template void transfer_memory(GenericTensorAccessorW dst_accessor, const DT *src, - AllocLocation src_loc) { + DeviceType src_device_type) { size_t bytes = dst_accessor.shape.get_volume() * sizeof(DT); - AllocLocation dst_loc = - dst_accessor.on_device ? AllocLocation::DEVICE : AllocLocation::HOST; - if (src_loc == AllocLocation::HOST && dst_loc == AllocLocation::HOST) { + DeviceType dst_device_type = dst_accessor.device_type; + + if (device_on_cpu(src_device_type) && device_on_cpu(dst_device_type)) { memcpy(dst_accessor.ptr, src, bytes); - } else if (src_loc == AllocLocation::HOST && - dst_loc == AllocLocation::DEVICE) { + } else if (device_on_cpu(src_device_type) && device_on_gpu(dst_device_type)) { checkCUDA(cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyHostToDevice)); - } else if (src_loc == AllocLocation::DEVICE && - dst_loc == AllocLocation::HOST) { + } else if (device_on_gpu(src_device_type) && device_on_cpu(dst_device_type)) { checkCUDA(cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyDeviceToHost)); } else { checkCUDA( @@ -35,11 +55,10 @@ GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, Allocator &allocator) { assert(shape.data_type == DataType::FLOAT || shape.data_type == DataType::DOUBLE); - using T = real_type
;
+
+  using T = real_type_t<DT>
; GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - accessor.on_device = - (allocator.alloc_location == AllocLocation::DEVICE) ? true : false; std::vector host_data(accessor.shape.num_elements()); std::random_device rd; @@ -50,7 +69,7 @@ GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, val = dist(gen); } - transfer_memory(accessor, host_data.data(), AllocLocation::HOST); + transfer_memory(accessor, host_data.data(), DeviceType::CPU); return accessor; } @@ -58,103 +77,64 @@ GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, template GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, Allocator &allocator) { + using T = real_type_t
;
   GenericTensorAccessorW accessor =
       create_random_filled_accessor_w<DT>
(shape, allocator); return read_only_accessor_from_write_accessor(accessor); } -template +template GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, Allocator &allocator, - DT val) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - accessor.on_device = - (allocator.alloc_location == AllocLocation::DEVICE) ? true : false; - - size_t volume = accessor.shape.get_volume(); - std::vector
host_data(volume, val); - - transfer_memory(accessor, host_data.data(), AllocLocation::HOST); - - return accessor; -} - -template -GenericTensorAccessorW create_transformed_accessor_w(TensorShape const &shape, - Allocator &allocator, - F transform) { + T val) { GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - accessor.on_device = - (allocator.alloc_location == AllocLocation::DEVICE) ? true : false; size_t volume = accessor.shape.get_volume(); - std::vector input_data(volume); - std::vector output_data(volume); - - std::transform( - input_data.begin(), input_data.end(), output_data.begin(), transform); + std::vector host_data(volume, val); - transfer_memory(accessor, output_data.data(), AllocLocation::HOST); + transfer_memory(accessor, host_data.data(), DeviceType::CPU); return accessor; } template -GenericTensorAccessorW - copy_tensor_between_memories(GenericTensorAccessorR accessor, - Allocator &allocator) { - TensorShape shape = get_tensor_shape(accessor.shape, accessor.data_type); - GenericTensorAccessorW copied_accessor = allocator.allocate_tensor(shape); - copied_accessor.on_device = - (allocator.alloc_location == AllocLocation::DEVICE) ? true : false; - - AllocLocation src_loc = - accessor.on_device ? AllocLocation::DEVICE : AllocLocation::HOST; - - transfer_memory(copied_accessor, accessor.get
(), src_loc); - - return copied_accessor; -} - -TensorShape make_tensor_shape_from_legion_dims(FFOrdered dims, - DataType DT); - -template -std::vector> load_accessor_data(GenericTensorAccessorR accessor) { - using T = real_type
;
+std::vector<real_type_t<DT>>
+    load_accessor_data(GenericTensorAccessorR accessor) {
+  using T = real_type_t<DT>
;
   int volume = accessor.shape.get_volume();
   std::vector<T> local_data(volume);
   T const *src_ptr = accessor.get<DT>
(); - if (accessor.on_device) { + if (device_on_cpu(accessor.device_type)) { + memcpy(local_data.data(), src_ptr, volume * sizeof(T)); + } else { checkCUDA(cudaMemcpy(local_data.data(), src_ptr, volume * sizeof(T), cudaMemcpyDeviceToHost)); - } else { - memcpy(local_data.data(), src_ptr, volume * sizeof(T)); } return local_data; } template -std::vector> load_accessor_data(GenericTensorAccessorW accessor) { - using T = real_type
;
+std::vector<real_type_t<DT>>
+    load_accessor_data(GenericTensorAccessorW accessor) {
+  using T = real_type_t<DT>
;
   int volume = accessor.shape.get_volume();
   std::vector<T> local_data(volume);
   T const *src_ptr = accessor.get<DT>
(); - if (accessor.on_device) { + if (device_on_cpu(accessor.device_type)) { + memcpy(local_data.data(), src_ptr, volume * sizeof(T)); + } else { checkCUDA(cudaMemcpy(local_data.data(), src_ptr, volume * sizeof(T), cudaMemcpyDeviceToHost)); - } else { - memcpy(local_data.data(), src_ptr, volume * sizeof(T)); } return local_data; @@ -165,4 +145,17 @@ bool contains_non_zero(std::vector &data) { return !all_of(data, [](T const &val) { return val == 0; }); } +template +bool vectors_are_approx_equal(T lhs, T rhs) { + float epsilon = 0.0001f; + return std::equal( + lhs.begin(), + lhs.end(), + rhs.begin(), + rhs.end(), + [epsilon](float a, float b) { return std::abs(a - b) < epsilon; }); +} + +} // namespace FlexFlow + #endif diff --git a/lib/local-execution/include/local-execution/local_cpu_allocator.h b/lib/local-execution/include/local-execution/local_cpu_allocator.h index d1e81facf2..cf6cfe35d1 100644 --- a/lib/local-execution/include/local-execution/local_cpu_allocator.h +++ b/lib/local-execution/include/local-execution/local_cpu_allocator.h @@ -12,6 +12,8 @@ struct LocalCPUAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_map> ptrs; }; diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index d6f338fe14..f697337c52 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -12,8 +12,10 @@ struct TrackedAllocator : public IAllocator { ~TrackedAllocator() = default; void *allocate(size_t) override; - void *allocate_and_zero(size_t) override; void deallocate(void *) override; + + DeviceType get_allocation_device_type() const override; + size_t get_current_mem_usage(); private: diff --git a/lib/local-execution/src/local_cpu_allocator.cc b/lib/local-execution/src/local_cpu_allocator.cc index 4ca5f987a8..c4657e26b5 100644 --- a/lib/local-execution/src/local_cpu_allocator.cc +++ b/lib/local-execution/src/local_cpu_allocator.cc @@ -17,6 +17,10 @@ void LocalCPUAllocator::deallocate(void *ptr) { } } +DeviceType LocalCPUAllocator::get_allocation_device_type() const { + return DeviceType::CPU; +} + Allocator create_local_cpu_memory_allocator() { return Allocator::create(); } diff --git a/lib/local-execution/src/local_task_argument_accessor.cc b/lib/local-execution/src/local_task_argument_accessor.cc index 5d0156201e..47b50e9f50 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -25,7 +25,10 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( this->tensor_slots_backing.at(slot_grad_pair)); if (priv == Permissions::RO) { GenericTensorAccessorR readonly_tensor_backing = { - tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}; + tensor_backing.data_type, + tensor_backing.shape, + tensor_backing.ptr, + this->allocator.get_allocation_device_type()}; return readonly_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { return tensor_backing; @@ -33,6 +36,7 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( throw mk_runtime_error("Unhandled privilege mode {}", priv); } } + VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( slot_id_t slot, Permissions priv, IsGrad is_grad) const { SlotGradId slot_grad_pair = SlotGradId{slot, 
is_grad}; @@ -43,7 +47,10 @@ VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( for (GenericTensorAccessorW const &tensor_backing : variadic_tensor_backing) { readonly_variadic_tensor_backing.push_back( - {tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}); + {tensor_backing.data_type, + tensor_backing.shape, + tensor_backing.ptr, + this->allocator.get_allocation_device_type()}); } return readonly_variadic_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index 9f13f006f3..ed181aea32 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -12,12 +12,6 @@ void *TrackedAllocator::allocate(size_t requested_memory_size) { return ptr; } -void *TrackedAllocator::allocate_and_zero(size_t requested_memory_size) { - void *ptr = this->allocator.allocate_and_zero(requested_memory_size); - this->current_mem_usage += requested_memory_size; - return ptr; -} - void TrackedAllocator::deallocate(void *ptr) { size_t psize; this->ptr_mem_usage.erase(ptr); @@ -29,9 +23,12 @@ size_t TrackedAllocator::get_current_mem_usage() { return this->current_mem_usage; } +DeviceType TrackedAllocator::get_allocation_device_type() const { + return this->allocator.get_allocation_device_type(); +} + Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { Allocator allocator = Allocator::create(base_allocator); - allocator.alloc_location = base_allocator.alloc_location; return allocator; } diff --git a/lib/pcg/src/strided_rectangle.cc b/lib/pcg/src/strided_rectangle.cc deleted file mode 100644 index 40307aea88..0000000000 --- a/lib/pcg/src/strided_rectangle.cc +++ /dev/null @@ -1,36 +0,0 @@ -#include "pcg/strided_rectangle.h" -#include "op-attrs/dim_ordered/transform.h" -#include "utils/containers.h" - -namespace FlexFlow { - -/* size_t StridedRectangle::at(FFOrdered const &coord) const { */ -/* assert(coord.size() == this->num_dims()); */ - -/* size_t _1d_stride = 1; */ -/* size_t idx = 0; */ -/* for (auto dim : inner_to_outer_idxs(this->sides)) { */ -/* idx += this->sides.at(dim).at(coord.at(dim)).value() * _1d_stride; */ -/* _1d_stride *= this->sides.at(dim).get_size().value(); */ -/* } */ -/* return idx; */ -/* } */ - -size_t get_num_dims(StridedRectangle const &rect) { - return rect.sides.size(); -} - -num_points_t get_num_points(StridedRectangle const &rect) { - return num_points_t{ - product(transform(rect.sides, [](StridedRectangleSide const &side) { - return side.num_points.unwrapped; - }))}; -} - -StridedRectangleSide get_side_at_idx(StridedRectangle const &rect, - ff_dim_t const &idx) { - return rect.sides.at(idx); -} - -} // namespace FlexFlow - From d50914c17fcd6d716c5f7eea0f5913a49fce2d46 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 8 Oct 2024 00:26:05 -0700 Subject: [PATCH 08/20] accessor.h formatting --- lib/kernels/include/kernels/accessor.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 25ba307c41..356526122e 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -32,7 +32,10 @@ class GenericTensorAccessorW { GenericTensorAccessorW() = delete; - GenericTensorAccessorW(DataType data_type, ArrayShape const &shape, void *ptr, DeviceType device_type); + GenericTensorAccessorW(DataType data_type, + 
ArrayShape const &shape, + void *ptr, + DeviceType device_type); bool operator==(GenericTensorAccessorW const &) const; bool operator!=(GenericTensorAccessorW const &) const; From f1f269897e715dd3ec2072df4754bbe4d9953895 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 8 Oct 2024 00:47:55 -0700 Subject: [PATCH 09/20] mk_runtime_error formatting --- lib/kernels/include/kernels/accessor.h | 10 +++++----- lib/kernels/src/accessor.cc | 16 ++++++++-------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 356526122e..88a47f97e8 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -46,8 +46,8 @@ class GenericTensorAccessorW { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } if (this->data_type != DT) { - throw mk_runtime_error( - "Invalid access data type ({} != {})", this->data_type, DT); + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); } using T = real_type_t
; @@ -64,8 +64,8 @@ class GenericTensorAccessorW { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } if (this->data_type != DT) { - throw mk_runtime_error( - "Invalid access data type ({} != {})", this->data_type, DT); + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); } using T = real_type_t
; @@ -131,7 +131,7 @@ class GenericTensorAccessorR { } if (this->data_type != DT) { throw mk_runtime_error( - "Invalid access data type ({} != {})", this->data_type, DT); + fmt::format("Invalid access data type ({} != {})", this->data_type, DT)); } using T = real_type_t
; diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index c0b11a2299..a2b3e94d33 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -21,10 +21,10 @@ size_t GenericTensorAccessorW::calculate_index_offset( std::initializer_list const &indices) const { if (indices.size() != this->shape.num_dims()) { - throw mk_runtime_error( + throw mk_runtime_error(fmt::format( "Number of indices ({}) does not match the number of dimensions ({}).", indices.size(), - this->shape.num_dims()); + this->shape.num_dims())); } size_t offset = 0; @@ -36,11 +36,11 @@ size_t GenericTensorAccessorW::calculate_index_offset( cur_idx = *it--; if (cur_idx >= this->shape[legion_dim_t(i)]) { - throw mk_runtime_error("In {} dimension, attempting to access index {} " + throw mk_runtime_error(fmt::format("In {} dimension, attempting to access index {} " "when only {} indexes exist", i, cur_idx, - this->shape[legion_dim_t(i)]); + this->shape[legion_dim_t(i)])); } offset += cur_idx * multiplier; @@ -110,10 +110,10 @@ size_t GenericTensorAccessorR::calculate_index_offset( std::initializer_list const &indices) const { if (indices.size() != this->shape.num_dims()) { - throw mk_runtime_error( + throw mk_runtime_error(fmt::format( "Number of indices ({}) does not match the number of dimensions ({}).", indices.size(), - this->shape.num_dims()); + this->shape.num_dims())); } size_t offset = 0; @@ -125,11 +125,11 @@ size_t GenericTensorAccessorR::calculate_index_offset( cur_idx = *it--; if (cur_idx >= this->shape[legion_dim_t(i)]) { - throw mk_runtime_error("In {} dimension, attempting to access index {} " + throw mk_runtime_error(fmt::format("In {} dimension, attempting to access index {} " "when only {} indexes exist", i, cur_idx, - this->shape[legion_dim_t(i)]); + this->shape[legion_dim_t(i)])); } offset += cur_idx * multiplier; From a7422f776a02ce29127730b8920234f696e8668f Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 8 Oct 2024 01:08:59 -0700 Subject: [PATCH 10/20] reverse_kernels include --- lib/kernels/src/cpu/reverse_kernels.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index 1971435d8c..afa92b307c 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -1,4 +1,5 @@ #include "kernels/reverse_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" #include #include From 5863880d5651f05825f8fbe266e9703390111308 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 14 Oct 2024 23:40:12 -0700 Subject: [PATCH 11/20] test_utils refactor and clarity --- lib/kernels/include/kernels/accessor.h | 22 ++- lib/kernels/include/kernels/allocation.h | 2 +- .../include/kernels/replicate_kernels.h | 2 +- .../include/kernels/replicate_kernels_cpu.h | 6 +- .../include/kernels/reverse_kernels_cpu.h | 14 +- lib/kernels/src/accessor.cc | 130 +++++++++--- lib/kernels/src/allocation.cc | 6 +- lib/kernels/src/cpu/replicate_kernels.cc | 53 +++-- lib/kernels/src/cpu/reverse_kernels.cc | 63 ++---- lib/kernels/src/cuda/ops/replicate_kernels.cu | 6 +- lib/kernels/src/cuda/ops/reverse_kernels.cu | 13 +- lib/kernels/test/src/test_attention_kernel.cc | 30 +-- .../test/src/test_batch_matmul_kernel.cc | 12 +- .../test/src/test_batch_norm_kernel.cc | 39 ++-- lib/kernels/test/src/test_cast_kernel.cc | 74 +++---- lib/kernels/test/src/test_combine_kernel.cc | 44 ++--- lib/kernels/test/src/test_concat_kernel.cc | 11 +- lib/kernels/test/src/test_dropout.cc | 15 +- 
lib/kernels/test/src/test_flat_kernel.cc | 21 +- lib/kernels/test/src/test_gather_kernels.cc | 20 +- .../test/src/test_layer_norm_kernels.cc | 13 +- lib/kernels/test/src/test_partition_kernel.cc | 24 +-- lib/kernels/test/src/test_pool_2d_kernels.cc | 16 +- lib/kernels/test/src/test_reduction_kernel.cc | 16 +- lib/kernels/test/src/test_replicate_kernel.cc | 66 ++----- lib/kernels/test/src/test_reshape_kernel.cc | 23 +-- lib/kernels/test/src/test_reverse_kernels.cc | 65 ++---- lib/kernels/test/src/test_softmax_kernel.cc | 21 +- lib/kernels/test/src/test_split_kernel.cc | 8 +- lib/kernels/test/src/test_transpose_kernel.cc | 17 +- lib/kernels/test/src/test_utils.cc | 161 +++++++++++---- lib/kernels/test/src/test_utils.h | 185 +++++++----------- .../src/local_task_argument_accessor.cc | 12 +- lib/local-execution/src/ops/replicate.cc | 2 +- 34 files changed, 559 insertions(+), 653 deletions(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index e29f73924c..0a134db695 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -11,6 +11,8 @@ namespace FlexFlow { +struct Allocator; + class GenericTensorAccessorW { public: template @@ -129,8 +131,8 @@ class GenericTensorAccessorR { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } if (this->data_type != DT) { - throw mk_runtime_error( - fmt::format("Invalid access data type ({} != {})", this->data_type, DT)); + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); } using T = real_type_t
; @@ -255,6 +257,22 @@ std::pair std::pair get_shape_and_datatype(GenericTensorAccessorW const &accessor); +void transfer_data_between_accessors( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor); + +void transfer_data_between_accessors( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorW const &src_accessor); + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator); + } // namespace FlexFlow namespace FlexFlow { diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 893be513ea..4bf97118ce 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_ALLOCATION_H #define _FLEXFLOW_KERNELS_ALLOCATION_H -#include "accessor.h" +#include "kernels/accessor.h" #include #include diff --git a/lib/kernels/include/kernels/replicate_kernels.h b/lib/kernels/include/kernels/replicate_kernels.h index 877eeabf04..7ed55cd1a1 100644 --- a/lib/kernels/include/kernels/replicate_kernels.h +++ b/lib/kernels/include/kernels/replicate_kernels.h @@ -11,8 +11,8 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas); } // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h index a72b799875..1c7aa4ee4a 100644 --- a/lib/kernels/include/kernels/replicate_kernels_cpu.h +++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h @@ -7,10 +7,10 @@ namespace FlexFlow::Kernels::Replicate { void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); + GenericTensorAccessorW &output); -void cpu_backward_kernel(GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output, +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, size_t num_replicas); } // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h index b0edaa264c..35af06aafb 100644 --- a/lib/kernels/include/kernels/reverse_kernels_cpu.h +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -1,22 +1,16 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H #define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H -#include "accessor.h" -#include "device.h" +#include "kernels/accessor.h" +#include "kernels/device.h" namespace FlexFlow::Kernels::Reverse { void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, - GenericTensorAccessorW &output_accessor, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size); + GenericTensorAccessorW &output_accessor); void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, - GenericTensorAccessorW &input_accessor, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size); + GenericTensorAccessorW &input_accessor); } // namespace FlexFlow::Kernels::Reverse #endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index a2b3e94d33..9332dd6703 100644 --- 
a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -1,7 +1,45 @@ #include "kernels/accessor.h" +#include "kernels/allocation.h" +#include "kernels/datatype_dispatch.h" namespace FlexFlow { +void transfer_data_between_accessors( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor) { + size_t num_bytes = dst_accessor.shape.get_volume() * + size_of_datatype(dst_accessor.data_type); + + DeviceType dst_device_type = dst_accessor.device_type; + DeviceType src_device_type = src_accessor.device_type; + + if (src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::CPU) { + memcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes); + } else if (src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::GPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyHostToDevice)); + } else if (src_device_type == DeviceType::GPU && + dst_device_type == DeviceType::CPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); + } else { + checkCUDA(cudaMemcpy(dst_accessor.ptr, + src_accessor.ptr, + num_bytes, + cudaMemcpyDeviceToDevice)); + } +} + +void transfer_data_between_accessors( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorW const &src_accessor) { + GenericTensorAccessorR r_src_accessor = + read_only_accessor_from_write_accessor(src_accessor); + transfer_data_between_accessors(dst_accessor, r_src_accessor); +} + GenericTensorAccessorW::GenericTensorAccessorW( DataType data_type, ArrayShape const &shape, @@ -30,21 +68,22 @@ size_t GenericTensorAccessorW::calculate_index_offset( size_t offset = 0; size_t multiplier = 1; size_t cur_idx; - auto it = indices.end() - 1; - - for (std::size_t i = this->shape.num_dims(); i-- > 0;) { - cur_idx = *it--; - - if (cur_idx >= this->shape[legion_dim_t(i)]) { - throw mk_runtime_error(fmt::format("In {} dimension, attempting to access index {} " - "when only {} indexes exist", - i, - cur_idx, - this->shape[legion_dim_t(i)])); + auto it = indices.begin(); + + for (size_t i = 0; i < this->shape.num_dims(); i++) { + cur_idx = *it++; + + if (cur_idx >= this->shape.at(legion_dim_t(i))) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + cur_idx, + this->shape.at(legion_dim_t(i)))); } offset += cur_idx * multiplier; - multiplier *= this->shape[legion_dim_t(i)]; + multiplier *= this->shape.at(legion_dim_t(i)); } return offset; @@ -119,21 +158,22 @@ size_t GenericTensorAccessorR::calculate_index_offset( size_t offset = 0; size_t multiplier = 1; size_t cur_idx; - auto it = indices.end() - 1; - - for (std::size_t i = this->shape.num_dims(); i-- > 0;) { - cur_idx = *it--; - - if (cur_idx >= this->shape[legion_dim_t(i)]) { - throw mk_runtime_error(fmt::format("In {} dimension, attempting to access index {} " - "when only {} indexes exist", - i, - cur_idx, - this->shape[legion_dim_t(i)])); + auto it = indices.begin(); + + for (size_t i = 0; i < this->shape.num_dims(); i++) { + cur_idx = *it++; + + if (cur_idx >= this->shape.at(legion_dim_t(i))) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + cur_idx, + this->shape.at(legion_dim_t(i)))); } offset += cur_idx * multiplier; - multiplier *= this->shape[legion_dim_t(i)]; + multiplier *= this->shape.at(legion_dim_t(i)); } return offset; @@ -307,4 +347,46 @@ std::pair return std::make_pair(accessor.shape, 
accessor.data_type); } +template +struct CopyTensorAccessorW { + GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + transfer_data_between_accessors(dst_accessor, src_accessor); + + return dst_accessor; + } +}; + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, std::ref(allocator)); +} + +template +struct CopyTensorAccessorR { + GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + transfer_data_between_accessors(dst_accessor, src_accessor); + + return read_only_accessor_from_write_accessor(dst_accessor); + } +}; + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, std::ref(allocator)); +} + } // namespace FlexFlow diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index 751cdc0ebb..733146851a 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -18,8 +18,10 @@ DeviceType Allocator::get_allocation_device_type() const { GenericTensorAccessorW Allocator::allocate_tensor(TensorShape const &tensor_shape) { void *ptr = this->allocate(get_size_in_bytes(tensor_shape)); - return { - tensor_shape.data_type, tensor_shape, ptr, get_allocation_device_type()}; + return {tensor_shape.data_type, + tensor_shape, + ptr, + this->get_allocation_device_type()}; } } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index 5853869047..683739b91e 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -3,52 +3,43 @@ namespace FlexFlow::Kernels::Replicate { -template -void cpu_replicate_backward_kernel(T *input, - T const *output, - size_t num_elements, - size_t num_replicas) { - for (size_t i = 0; i < num_elements; i++) { - T sum = 0; - for (size_t j = 0; j < num_replicas; j++) { - sum += output[i + j * num_elements]; - } - input[i] = sum; - } -} - -template +template struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - memcpy(output.get(), - input.get(), - input.shape.num_elements() * size_of_datatype(T)); + GenericTensorAccessorW &output) { + memcpy(output.get
<DT>(),
+           input.get<DT>
(), + input.shape.num_elements() * size_of_datatype(DT)); } }; -template +template struct CPUBackwardKernel { - void operator()(GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output, + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, size_t num_replicas) { - cpu_replicate_backward_kernel(input.get(), - output.get(), - input.shape.num_elements(), - num_replicas); + using T = real_type_t
<DT>;
+    for (size_t i = 0; i < input.shape.num_elements(); i++) {
+      T cur_sum = 0;
+      for (size_t j = 0; j < num_replicas; j++) {
+        cur_sum += output.at<DT>
(i, j);
+      }
+      input.at<DT>
(i) = cur_sum; + } } }; void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(input.data_type, input, output); + GenericTensorAccessorW &output) { + DataTypeDispatch1{}( + input.data_type, input, std::ref(output)); } -void cpu_backward_kernel(GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output, +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, size_t num_replicas) { DataTypeDispatch1{}( - input.data_type, input, output, num_replicas); + input.data_type, output, std::ref(input), num_replicas); } } // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index afa92b307c..bc114c4e60 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -1,5 +1,5 @@ -#include "kernels/reverse_kernels_cpu.h" #include "kernels/datatype_dispatch.h" +#include "kernels/reverse_kernels_cpu.h" #include #include @@ -8,31 +8,20 @@ namespace FlexFlow::Kernels::Reverse { template struct CPUReverseForwardKernel { void operator()(GenericTensorAccessorR const &input, - GenericTensorAccessorW &output, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size) { + GenericTensorAccessorW &output) { assert(input.data_type == DT && output.data_type == DT); - // For each output block, copy the input block - for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) { - for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; ++rev_idx) { - for (coord_t i = 0; i < in_blk_size; ++i) { - output.at
<DT>(blk_idx, rev_idx, i) =
-              input.at<DT>
(blk_idx, rev_idx, i); - } - } - } - - // Reverse the blocks within each output block - for (coord_t blk_idx = 0; blk_idx < num_out_blks; ++blk_idx) { - for (coord_t rev_idx = 0; rev_idx < reverse_dim_size / 2; ++rev_idx) { - coord_t start_idx = rev_idx; - coord_t end_idx = reverse_dim_size - 1 - rev_idx; - - for (coord_t i = 0; i < in_blk_size; ++i) { - std::swap(output.at
<DT>(blk_idx, start_idx, i),
-                    output.at<DT>
(blk_idx, end_idx, i)); + coord_t num_out_blocks = input.shape.at(legion_dim_t(0)); + coord_t reverse_dim_size = input.shape.at(legion_dim_t(1)); + coord_t in_block_size = input.shape.at(legion_dim_t(2)); + + for (coord_t block_idx = 0; block_idx < num_out_blocks; block_idx++) { + for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) { + for (coord_t i = 0; i < in_block_size; i++) { + output.at
<DT>(block_idx, rev_idx, i) =
+              input.at<DT>
(num_out_blocks - 1 - block_idx, + reverse_dim_size - 1 - rev_idx, + in_block_size - 1 - i); } } } @@ -40,29 +29,15 @@ struct CPUReverseForwardKernel { }; void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, - GenericTensorAccessorW &output_accessor, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size) { - DataTypeDispatch1{}(input_accessor.data_type, - input_accessor, - std::ref(output_accessor), - num_out_blks, - reverse_dim_size, - in_blk_size); + GenericTensorAccessorW &output_accessor) { + DataTypeDispatch1{}( + input_accessor.data_type, input_accessor, std::ref(output_accessor)); } void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, - GenericTensorAccessorW &input_accessor, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size) { - DataTypeDispatch1{}(output_accessor.data_type, - output_accessor, - std::ref(input_accessor), - num_out_blks, - reverse_dim_size, - in_blk_size); + GenericTensorAccessorW &input_accessor) { + DataTypeDispatch1{}( + output_accessor.data_type, output_accessor, std::ref(input_accessor)); } } // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index 76bfbe2658..1aa61375f0 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -50,8 +50,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { size_t total_elements = input.shape.num_elements() * num_replicas; replicate_backward_kernel> @@ -70,11 +70,11 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { DataTypeDispatch1{}( - input.data_type, stream, input, output, num_replicas); + input.data_type, stream, output, input, num_replicas); } } // namespace Replicate diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index f73c57dedf..8e93fec0d6 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -27,6 +27,7 @@ namespace Reverse { // coord_t reverse_dim_size, // coord_t in_blk_size) { // CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { +// coord_t out_idx = i; // coord_t blk_idx = i / (reverse_dim_size * in_blk_size); // i = i - blk_idx * (reverse_dim_size * in_blk_size); // coord_t reverse_dim_idx = i / in_blk_size; @@ -34,8 +35,18 @@ namespace Reverse { // coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + // (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + // i; -// out_ptr[i] = in_ptr[in_idx]; +// out_ptr[out_idx] = in_ptr[in_idx]; // } +// CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { +// coord_t blk_idx = i / (reverse_dim_size * in_blk_size); +// i = i - blk_idx * (reverse_dim_size * in_blk_size); +// coord_t reverse_dim_idx = i / in_blk_size; +// i = i - reverse_dim_idx * in_blk_size; +// coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + +// (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + +// i; +// out_ptr[i] = in_ptr[in_idx]; +// } // } /* I mentioned this earlier, but I still think the reverse_forward_kernel code diff --git 
a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index bbb3c62a85..5245fab915 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -45,16 +45,13 @@ TEST_SUITE(FF_TEST_SUITE) { make_tensor_shape_from_legion_dims({state.weightSize}, DataType::FLOAT); GenericTensorAccessorW query_accessor = - create_random_filled_accessor_w(query_shape, - allocator); + create_random_filled_accessor_w(query_shape, allocator); GenericTensorAccessorW key_accessor = - create_random_filled_accessor_w(key_shape, allocator); + create_random_filled_accessor_w(key_shape, allocator); GenericTensorAccessorW value_accessor = - create_random_filled_accessor_w(value_shape, - allocator); + create_random_filled_accessor_w(value_shape, allocator); GenericTensorAccessorW weight_accessor = - create_random_filled_accessor_w(weight_shape, - allocator); + create_random_filled_accessor_w(weight_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -69,27 +66,20 @@ TEST_SUITE(FF_TEST_SUITE) { weight_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output = - load_accessor_data(output_accessor); - CHECK(contains_non_zero(host_output)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorW query_grad_accessor = - create_random_filled_accessor_w(query_shape, - allocator); + create_random_filled_accessor_w(query_shape, allocator); GenericTensorAccessorW key_grad_accessor = - create_random_filled_accessor_w(key_shape, - allocator); + create_random_filled_accessor_w(key_shape, allocator); GenericTensorAccessorW value_grad_accessor = - create_random_filled_accessor_w(value_shape, - allocator); + create_random_filled_accessor_w(value_shape, allocator); GenericTensorAccessorW weight_grad_accessor = - create_random_filled_accessor_w(weight_shape, - allocator); + create_random_filled_accessor_w(weight_shape, allocator); GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); Kernels::MultiHeadAttention::backward_kernel( managed_stream.raw_stream(), diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index e64941b574..c08e08fd08 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -27,14 +27,11 @@ TEST_SUITE(FF_TEST_SUITE) { make_tensor_shape_from_legion_dims({m, n, batch}, DataType::FLOAT); GenericTensorAccessorW a_accessor = - create_random_filled_accessor_w(input_shape_a, - allocator); + create_random_filled_accessor_w(input_shape_a, allocator); GenericTensorAccessorW b_accessor = - create_random_filled_accessor_w(input_shape_b, - allocator); + create_random_filled_accessor_w(input_shape_b, allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { Kernels::BatchMatmul::forward_kernel(managed_stream.raw_stream(), @@ -53,8 +50,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW o_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW a_grad_accessor = allocator.allocate_tensor(input_shape_a); 
GenericTensorAccessorW b_grad_accessor = diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 5135d703fd..a8a26b8eaf 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -33,17 +33,15 @@ TEST_SUITE(FF_TEST_SUITE) { {output_n, output_c, output_h, output_w}, DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW scale_accessor = - create_filled_accessor_w(scale_shape, allocator, 1.0f); + create_filled_accessor_w(scale_shape, allocator, 1.0f); SUBCASE("forward_kernel") { GenericTensorAccessorW bias_accessor = - create_filled_accessor_w(bias_shape, allocator, 0.0f); + create_filled_accessor_w(bias_shape, allocator, 0.0f); Kernels::BatchNorm::forward_kernel(managed_stream.raw_stream(), state, @@ -52,24 +50,18 @@ TEST_SUITE(FF_TEST_SUITE) { scale_accessor.get_float_ptr(), bias_accessor.get_float_ptr()); - std::vector host_output_data = - load_accessor_data(output_accessor); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW scale_grad_accessor = - create_random_filled_accessor_w(scale_shape, - allocator); + create_random_filled_accessor_w(scale_shape, allocator); GenericTensorAccessorW bias_grad_accessor = - create_random_filled_accessor_w(bias_shape, - allocator); + create_random_filled_accessor_w(bias_shape, allocator); Kernels::BatchNorm::backward_kernel(managed_stream.raw_stream(), state, @@ -82,16 +74,9 @@ TEST_SUITE(FF_TEST_SUITE) { bias_grad_accessor.get_float_ptr(), input_accessor.shape.num_elements()); - std::vector host_input_grad_data = - load_accessor_data(input_grad_accessor); - std::vector host_scale_grad_data = - load_accessor_data(scale_grad_accessor); - std::vector host_bias_grad_data = - load_accessor_data(bias_grad_accessor); - - CHECK(contains_non_zero(host_input_grad_data)); - CHECK(contains_non_zero(host_scale_grad_data)); - CHECK(contains_non_zero(host_bias_grad_data)); + CHECK(contains_non_zero(input_grad_accessor)); + CHECK(contains_non_zero(scale_grad_accessor)); + CHECK(contains_non_zero(bias_grad_accessor)); } Kernels::BatchNorm::cleanup_kernel(allocator, diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index b77e743b62..c5b1d98bb1 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -17,8 +17,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - create_random_filled_accessor_r(input_shape, - allocator); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -28,19 +27,14 @@ TEST_SUITE(FF_TEST_SUITE) { DataType::FLOAT, DataType::DOUBLE); - std::vector host_double_data = - 
load_accessor_data(output_accessor); - - CHECK(contains_non_zero(host_double_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR grad_output_accessor = - create_random_filled_accessor_r(output_shape, - allocator); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW grad_input_accessor = - allocator.allocate_tensor(input_shape); - fill_with_zeros(grad_input_accessor); + create_zero_filled_accessor_w(input_shape, allocator); Kernels::Cast::backward_kernel(managed_stream.raw_stream(), grad_output_accessor, @@ -48,9 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) { DataType::DOUBLE, DataType::FLOAT); - std::vector host_grad_float_data = - load_accessor_data(grad_input_accessor); - CHECK(contains_non_zero(host_grad_float_data)); + CHECK(contains_non_zero(grad_input_accessor)); } } @@ -61,45 +53,37 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = - make_tensor_shape_from_legion_dims({100, 100}, DataType::FLOAT); + make_tensor_shape_from_legion_dims({10, 2}, DataType::FLOAT); TensorShape output_shape = - make_tensor_shape_from_legion_dims({100, 100}, DataType::INT32); - - GenericTensorAccessorW output_accessor_gpu = - gpu_allocator.allocate_tensor(output_shape); - GenericTensorAccessorW output_accessor_cpu = - cpu_allocator.allocate_tensor(output_shape); + make_tensor_shape_from_legion_dims({10, 2}, DataType::DOUBLE); // Only calling forward kernel as backward kernel is exactly the same SUBCASE("forward_kernel") { // Run GPU Forward Kernel - GenericTensorAccessorW input_accessor_gpu = - create_random_filled_accessor_w(input_shape, - gpu_allocator); - Kernels::Cast::forward_kernel( - managed_stream.raw_stream(), - read_only_accessor_from_write_accessor(input_accessor_gpu), - output_accessor_gpu, - DataType::FLOAT, - DataType::INT32); - - std::vector result_data_gpu = - load_accessor_data(output_accessor_gpu); + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Cast::forward_kernel(managed_stream.raw_stream(), + input_accessor_gpu, + output_accessor_gpu, + DataType::FLOAT, + DataType::DOUBLE); // Run CPU Forward Kernel - GenericTensorAccessorW input_accessor_cpu = - create_random_filled_accessor_w(input_shape, - cpu_allocator); - Kernels::Cast::cpu_forward_kernel( - read_only_accessor_from_write_accessor(input_accessor_cpu), - output_accessor_cpu, - DataType::FLOAT, - DataType::INT32); - - std::vector result_data_cpu = - load_accessor_data(output_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Cast::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu, + DataType::FLOAT, + DataType::DOUBLE); + + CHECK(w_accessors_are_equal(output_accessor_gpu, + output_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 8999a45b06..89d06dff96 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -17,23 +17,19 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR 
input_accessor = - create_random_filled_accessor_r(input_shape, - allocator); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Combine::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector host_output_data = - load_accessor_data(output_accessor); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - create_random_filled_accessor_r(output_shape, - allocator); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -41,9 +37,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor, input_grad_accessor); - std::vector host_input_grad = - load_accessor_data(input_grad_accessor); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); } } @@ -60,17 +54,13 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { // Run GPU Combine Forward Kernel GenericTensorAccessorR input_accessor_gpu = - create_random_filled_accessor_r(input_shape, - gpu_allocator); + create_random_filled_accessor_r(input_shape, gpu_allocator); GenericTensorAccessorW output_accessor_gpu = gpu_allocator.allocate_tensor(output_shape); Kernels::Combine::forward_kernel( managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); - std::vector result_data_gpu = - load_accessor_data(output_accessor_gpu); - // Run CPU Combine Forward Kernel GenericTensorAccessorR input_accessor_cpu = copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); @@ -80,42 +70,32 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Combine::cpu_forward_kernel(input_accessor_cpu, output_accessor_cpu); - std::vector result_data_cpu = - load_accessor_data(output_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); + CHECK(w_accessors_are_equal(output_accessor_gpu, + output_accessor_cpu)); } SUBCASE("backward_kernel") { // Run GPU Combine Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = - create_random_filled_accessor_r(output_shape, - gpu_allocator); + create_random_filled_accessor_r(output_shape, gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_gpu); + create_zero_filled_accessor_w(input_shape, gpu_allocator); Kernels::Combine::backward_kernel(managed_stream.raw_stream(), output_grad_accessor_gpu, input_grad_accessor_gpu); - std::vector result_data_gpu = - load_accessor_data(input_grad_accessor_gpu); - // Run CPU Combine Backward Kernel GenericTensorAccessorR output_grad_accessor_cpu = copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_cpu); + create_zero_filled_accessor_w(input_shape, cpu_allocator); Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, input_grad_accessor_cpu); - std::vector result_data_cpu = - load_accessor_data(input_grad_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); + CHECK(w_accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index a2becc3a54..b30995cf15 100644 --- 
a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -23,8 +23,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { std::vector input_accessors = repeat(num_inputs, [&]() { - return create_random_filled_accessor_r(input_shape, - allocator); + return create_random_filled_accessor_r(input_shape, allocator); }); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -34,16 +33,12 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessors, concat_axis); - std::vector host_output_data = - load_accessor_data(output_accessor); - - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - create_random_filled_accessor_r(output_shape, - allocator); + create_random_filled_accessor_r(output_shape, allocator); std::vector input_grad_accessors = repeat( num_inputs, [&]() { return allocator.allocate_tensor(input_shape); }); diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index d32eae916a..2c7e2657f7 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -1,6 +1,7 @@ #include "doctest/doctest.h" #include "kernels/dropout_kernels.h" #include "test_utils.h" +#include "utils/containers/count.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { @@ -30,8 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - create_random_filled_accessor_r(input_shape, - allocator); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -40,19 +40,14 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output_accessor = - load_accessor_data(output_accessor); - - CHECK(contains_non_zero(host_output_accessor)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_data = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_data = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Dropout::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index c4d6aa666a..3a3e3b28b7 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -16,7 +16,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 2.0f)); + create_filled_accessor_w(input_shape, allocator, 2.0f)); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -26,32 +26,21 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor, output_accessor.get_float_ptr()); - std::vector check_output_data = - load_accessor_data(output_accessor); - - std::vector expected_output_data( - input_accessor.shape.num_elements(), 2.0f); - CHECK(vectors_are_approx_equal(check_output_data, expected_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 0.0f); + create_filled_accessor_w(output_shape, allocator, 0.0f); 
GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); + create_filled_accessor_w(input_shape, allocator, 1.0f); Kernels::Flat::backward_kernel(managed_stream.raw_stream(), input_accessor, input_grad_accessor.get_float_ptr(), output_grad_accessor.get_float_ptr()); - std::vector backward_output_data = - load_accessor_data(input_grad_accessor); - - std::vector expected_output_data( - input_accessor.shape.num_elements(), 1.0f); - CHECK( - vectors_are_approx_equal(backward_output_data, expected_output_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index b8c4da0df2..fd7a8ab47a 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -18,13 +18,11 @@ TEST_SUITE(FF_TEST_SUITE) { make_tensor_shape_from_legion_dims({50}, DataType::FLOAT); GenericTensorAccessorR index_accessor = - create_random_filled_accessor_r(output_shape, - allocator); + create_random_filled_accessor_r(output_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - create_random_filled_accessor_r(input_shape, - allocator); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -34,18 +32,14 @@ TEST_SUITE(FF_TEST_SUITE) { index_accessor, output_accessor); - std::vector host_output_data = - load_accessor_data(output_accessor); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - create_random_filled_accessor_r(output_shape, - allocator); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Gather::backward_kernel(managed_stream.raw_stream(), state, @@ -53,9 +47,7 @@ TEST_SUITE(FF_TEST_SUITE) { index_accessor, input_grad_accessor); - std::vector host_input_grad_data = - load_accessor_data(input_grad_accessor); - CHECK(contains_non_zero(host_input_grad_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 651959d171..b667716181 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -31,16 +31,15 @@ TEST_SUITE(FF_TEST_SUITE) { epsilon); GenericTensorAccessorR input_accessor = - create_random_filled_accessor_r(input_shape, - allocator); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW gamma_accessor = - create_filled_accessor_w(feature_shape, allocator, 1.0f); + create_filled_accessor_w(feature_shape, allocator, 1.0f); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); GenericTensorAccessorW beta_accessor = - create_filled_accessor_w(feature_shape, allocator, 0.0f); + create_filled_accessor_w(feature_shape, allocator, 0.0f); Kernels::LayerNorm::forward_kernel(managed_stream.raw_stream(), state, @@ -52,11 +51,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - create_random_filled_accessor_r(output_shape, - allocator); + create_random_filled_accessor_r(output_shape, 
allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW gamma_grad_accessor = allocator.allocate_tensor(feature_shape); GenericTensorAccessorW beta_grad_accessor = diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 21c970e175..7110128885 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -20,42 +20,28 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_filled_accessor_r(input_shape, allocator, 1.0f); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Repartition::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector check_output_data = - load_accessor_data(output_accessor); - - std::vector expected_output_data( - input_accessor.shape.num_elements(), 1.0f); - - CHECK(vectors_are_approx_equal(check_output_data, expected_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_filled_accessor_r(output_shape, allocator, 1.0f); GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + create_filled_accessor_w(input_shape, allocator, 2.0f); Kernels::Repartition::backward_kernel(managed_stream.raw_stream(), state, input_grad_accessor, output_grad_accessor); - std::vector host_grad_input_data = - load_accessor_data(input_grad_accessor); - - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements(), 3.0f); - CHECK(vectors_are_approx_equal(host_grad_input_data, - expected_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index e014accfd3..52a177dd72 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -42,11 +42,9 @@ TEST_SUITE(FF_TEST_SUITE) { {output_w, output_h, output_c, output_n}, DataType::FLOAT); GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { Kernels::Pool2D::forward_kernel(managed_stream.raw_stream(), @@ -54,14 +52,12 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.ptr, output_accessor.ptr); - std::vector host_output_data = - load_accessor_data(output_accessor); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + create_filled_accessor_w(output_shape, allocator, 1.0f); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -72,9 +68,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor.ptr, output_grad_accessor.ptr); - std::vector host_input_grad = - 
load_accessor_data(input_grad_accessor); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 999044beb8..d727e267fe 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -20,8 +20,7 @@ TEST_SUITE(FF_TEST_SUITE) { make_tensor_shape_from_legion_dims({10}, DataType::FLOAT); GenericTensorAccessorR input_accessor = - create_random_filled_accessor_r(input_shape, - allocator); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -30,17 +29,14 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor, num_replicas); - std::vector host_output_data = - load_accessor_data(output_accessor); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { TensorShape output_shape = input_shape; GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_filled_accessor_r(output_shape, allocator, 1.0f); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -48,11 +44,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_grad_accessor, output_grad_accessor); - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements(), 1.0f); - std::vector host_grad_data = - load_accessor_data(input_grad_accessor); - CHECK(vectors_are_approx_equal(host_grad_data, expected_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index dae69c0262..77f4001328 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -20,41 +20,32 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Replicate::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector check_output_data = - load_accessor_data(output_accessor); - - std::vector expected_output_data( - input_accessor.shape.num_elements(), 1.0f); - CHECK(vectors_are_approx_equal(check_output_data, expected_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor, output_grad_accessor, + input_grad_accessor, num_replicas); - std::vector check_aggregated_data = - load_accessor_data(input_grad_accessor); - CHECK(contains_non_zero(check_aggregated_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } - TEST_CASE("Check Replicate Forward Kernel against CPU Kernel") { + 
TEST_CASE("Check Replicate Forward and Backward Kernel against CPU Kernel") { std::size_t num_replicas = 2; TensorShape input_shape = @@ -71,66 +62,49 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { // Run GPU Replicate Forward Kernel GenericTensorAccessorR input_accessor_gpu = - create_random_filled_accessor_r(input_shape, - gpu_allocator); + create_random_filled_accessor_r(input_shape, gpu_allocator); GenericTensorAccessorW output_accessor_gpu = - gpu_allocator.allocate_tensor(output_shape); - fill_with_zeros(output_accessor_gpu); + create_zero_filled_accessor_w(output_shape, gpu_allocator); Kernels::Replicate::forward_kernel( managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); - std::vector result_data_gpu = - load_accessor_data(output_accessor_gpu); - // Run CPU Replicate Forward Kernel GenericTensorAccessorR input_accessor_cpu = copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); GenericTensorAccessorW output_accessor_cpu = - cpu_allocator.allocate_tensor(output_shape); - fill_with_zeros(output_accessor_cpu); + create_zero_filled_accessor_w(output_shape, cpu_allocator); Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, output_accessor_cpu); - std::vector result_data_cpu = - load_accessor_data(output_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); + CHECK(w_accessors_are_equal(output_accessor_gpu, + output_accessor_cpu)); } SUBCASE("backward_kernel") { // Run GPU Replicate Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = - create_random_filled_accessor_r(output_shape, - gpu_allocator); + create_random_filled_accessor_r(output_shape, gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_gpu); + create_zero_filled_accessor_w(input_shape, gpu_allocator); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor_gpu, output_grad_accessor_gpu, + input_grad_accessor_gpu, num_replicas); - std::vector result_data_gpu = - load_accessor_data(input_grad_accessor_gpu); - // Run CPU Replicate Backward Kernel GenericTensorAccessorR output_grad_accessor_cpu = copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); - GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_cpu); + create_zero_filled_accessor_w(input_shape, cpu_allocator); Kernels::Replicate::cpu_backward_kernel( - input_grad_accessor_cpu, output_grad_accessor_cpu, num_replicas); - - std::vector result_data_cpu = - load_accessor_data(input_grad_accessor_cpu); + output_grad_accessor_cpu, input_grad_accessor_cpu, num_replicas); - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); + CHECK(w_accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index 016e7b490a..92a61524a3 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -19,41 +19,28 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Reshape::forward_kernel( managed_stream.raw_stream(), 
state, input_accessor, output_accessor); - std::vector check_output_data = - load_accessor_data(output_accessor); - - std::vector expected_output_data( - input_accessor.shape.num_elements(), 1.0f); - CHECK(vectors_are_approx_equal(check_output_data, expected_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + allocator.allocate_tensor(input_shape); Kernels::Reshape::backward_kernel(managed_stream.raw_stream(), state, input_grad_accessor, output_grad_accessor); - std::vector host_grad_input_data = - load_accessor_data(input_grad_accessor); - - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements(), 3.0f); - CHECK(vectors_are_approx_equal(host_grad_input_data, - expected_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 94e6f139ff..4e98ea701b 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -22,7 +22,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_filled_accessor_w(input_shape, allocator, 1.0f)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -34,16 +34,12 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_accessor.shape.num_elements()); - std::vector check_output_data = - load_accessor_data(output_accessor); - - CHECK(contains_non_zero(check_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -56,17 +52,14 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_grad_accessor.shape.num_elements()); - std::vector host_grad_input_data = - load_accessor_data(input_grad_accessor); - - CHECK(contains_non_zero(host_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { - std::size_t num_out_blks = 2; + std::size_t num_out_blks = 1; std::size_t reverse_dim_size = 3; - std::size_t in_blk_size = 5; + std::size_t in_blk_size = 2; TensorShape input_shape = make_tensor_shape_from_legion_dims( {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); @@ -85,11 +78,9 @@ TEST_SUITE(FF_TEST_SUITE) { // Run GPU Cast Forward Kernel GenericTensorAccessorR input_accessor_gpu = - create_random_filled_accessor_r(input_shape, - gpu_allocator); + create_random_filled_accessor_r(input_shape, gpu_allocator); GenericTensorAccessorW output_accessor_gpu = - gpu_allocator.allocate_tensor(output_shape); - fill_with_zeros(output_accessor_gpu); + create_zero_filled_accessor_w(output_shape, gpu_allocator); Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), input_accessor_gpu.get_float_ptr(), @@ -99,36 +90,25 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, 
input_accessor_gpu.shape.num_elements()); - std::vector result_data_gpu = - load_accessor_data(output_accessor_gpu); - // Run CPU Cast Forward Kernel GenericTensorAccessorR input_accessor_cpu = copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); GenericTensorAccessorW output_accessor_cpu = - cpu_allocator.allocate_tensor(output_shape); - fill_with_zeros(output_accessor_cpu); + create_zero_filled_accessor_w(output_shape, cpu_allocator); Kernels::Reverse::cpu_forward_kernel(input_accessor_cpu, - output_accessor_cpu, - num_out_blks, - reverse_dim_size, - in_blk_size); + output_accessor_cpu); - std::vector result_data_cpu = - load_accessor_data(output_accessor_cpu); - - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); + CHECK(w_accessors_are_equal(output_accessor_cpu, + output_accessor_cpu)); } SUBCASE("backward_kernel") { // Run GPU Cast Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = - create_random_filled_accessor_r(output_shape, - gpu_allocator); + create_random_filled_accessor_r(output_shape, gpu_allocator); GenericTensorAccessorW input_grad_accessor_gpu = - gpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_gpu); + create_zero_filled_accessor_w(input_shape, gpu_allocator); Kernels::Reverse::backward_kernel( managed_stream.raw_stream(), @@ -139,26 +119,17 @@ TEST_SUITE(FF_TEST_SUITE) { in_blk_size, input_grad_accessor_gpu.shape.num_elements()); - std::vector result_data_gpu = - load_accessor_data(input_grad_accessor_gpu); - // Run CPU Cast Backward Kernel GenericTensorAccessorR output_grad_accessor_cpu = copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); GenericTensorAccessorW input_grad_accessor_cpu = - cpu_allocator.allocate_tensor(input_shape); - fill_with_zeros(input_grad_accessor_cpu); + create_zero_filled_accessor_w(input_shape, cpu_allocator); Kernels::Reverse::cpu_backward_kernel(output_grad_accessor_cpu, - input_grad_accessor_cpu, - num_out_blks, - reverse_dim_size, - in_blk_size); - - std::vector result_data_cpu = - load_accessor_data(input_grad_accessor_cpu); + input_grad_accessor_cpu); - CHECK(vectors_are_approx_equal(result_data_gpu, result_data_cpu)); + CHECK(w_accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index e4f73d4747..f723a9ca46 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -21,27 +21,23 @@ TEST_SUITE(FF_TEST_SUITE) { managed_handle.raw_handle(), 0, input_n, channels, input_h, input_w); GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Softmax::forward_kernel(managed_stream.raw_stream(), state, input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output_data = - load_accessor_data(output_accessor); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); 
GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -51,12 +47,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor.get_float_ptr(), output_grad_accessor.shape.num_elements()); - std::vector expected_input_grad_data = - std::vector(input_grad_accessor.shape.num_elements(), 1.0f); - std::vector host_input_grad_data = - load_accessor_data(input_grad_accessor); - CHECK(vectors_are_approx_equal(host_input_grad_data, - expected_input_grad_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index 649f188e9e..a3cf215dff 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -24,8 +24,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); std::vector output_ptrs = repeat(num_outputs, [&]() { GenericTensorAccessorW output_accessor = @@ -46,13 +45,12 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector output_grad_ptrs(num_outputs); for (int i = 0; i < num_outputs; i++) { GenericTensorAccessorW output_grad_accessor = - create_random_filled_accessor_w(output_shape, - allocator); + create_random_filled_accessor_w(output_shape, allocator); output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 0.0f); + create_filled_accessor_w(input_shape, allocator, 0.0f); Kernels::Split::backward_kernel(managed_stream.raw_stream(), input_grad_accessor.get_float_ptr(), diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 2abbd66c8f..d5d0b00576 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -23,35 +23,28 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - create_random_filled_accessor_r(input_shape, - allocator); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Transpose::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector host_output_data = - load_accessor_data(output_accessor); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - create_random_filled_accessor_r(output_shape, - allocator); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, - allocator); + create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), state, input_grad_accessor, output_grad_accessor); - std::vector host_grad_input_data = - load_accessor_data(input_grad_accessor); - CHECK(contains_non_zero(host_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index 02421f9bc5..103c866c10 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -1,13 +1,14 @@ #include "test_utils.h" +#include "op-attrs/tensor_shape.h" +#include namespace FlexFlow { -bool 
device_on_cpu(DeviceType device_type) { - return device_type == DeviceType::CPU; -} - -bool device_on_gpu(DeviceType device_type) { - return device_type == DeviceType::GPU; +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW result_accessor = allocator.allocate_tensor(shape); + fill_with_zeros(result_accessor); + return result_accessor; } TensorShape @@ -22,47 +23,54 @@ TensorShape } template -struct CopyTensorAccessorW { - GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, +struct CreateRandomFilledAccessorW { + GenericTensorAccessorW operator()(TensorShape const &shape, Allocator &allocator) { - TensorShape shape = - get_tensor_shape(src_accessor.shape, src_accessor.data_type); - GenericTensorAccessorW copied_tensor = allocator.allocate_tensor(shape); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); + + using T = real_type_t
<DT>;
+    T *data_ptr = src_accessor.get<DT>
(); + + std::random_device rd; + std::mt19937 gen(rd()); + size_t num_elements = get_num_elements(shape); + if constexpr (std::is_same::value) { + std::bernoulli_distribution dist(0.5); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } else if constexpr (std::is_floating_point::value) { + std::uniform_real_distribution dist(-1.0, 1.0); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } else if constexpr (std::is_integral::value) { + std::uniform_int_distribution dist(0, 100); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } - transfer_memory( - copied_tensor, src_accessor.get
(), src_accessor.device_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + transfer_data_between_accessors(dst_accessor, src_accessor); - return copied_tensor; + return dst_accessor; } }; -GenericTensorAccessorW - copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, - Allocator &allocator) { - return DataTypeDispatch1{}( - src_accessor.data_type, src_accessor, std::ref(allocator)); +GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, + Allocator &allocator) { + return DataTypeDispatch1{}( + shape.data_type, shape, std::ref(allocator)); } -template -struct CopyTensorAccessorR { - GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, - Allocator &allocator) { - TensorShape shape = - get_tensor_shape(src_accessor.shape, src_accessor.data_type); - GenericTensorAccessorW copied_tensor = allocator.allocate_tensor(shape); - - transfer_memory( - copied_tensor, src_accessor.get
(), src_accessor.device_type); - - return read_only_accessor_from_write_accessor(copied_tensor); - } -}; +GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW accessor = + create_random_filled_accessor_w(shape, allocator); -GenericTensorAccessorR - copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, - Allocator &allocator) { - return DataTypeDispatch1{}( - src_accessor.data_type, src_accessor, std::ref(allocator)); + return read_only_accessor_from_write_accessor(accessor); } template @@ -83,4 +91,81 @@ void fill_with_zeros(GenericTensorAccessorW const &accessor) { DataTypeDispatch1{}(accessor.data_type, accessor); } +template +struct CPUAccessorRContainsNonZero { + bool operator()(GenericTensorAccessorR const &accessor) { + using T = real_type_t
<DT>;
+
+    T const *data_ptr = accessor.get<DT>
(); + + for (size_t i = 0; i < accessor.shape.num_elements(); i++) { + if (data_ptr[i] != 0) { + return true; + } + } + + return false; + } +}; + +bool contains_non_zero(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + create_cpu_compatible_accessor_r(accessor, cpu_allocator); + return DataTypeDispatch1{}( + cpu_accessor.data_type, cpu_accessor); +} + +bool contains_non_zero(GenericTensorAccessorW const &accessor) { + GenericTensorAccessorR r_accessor = + read_only_accessor_from_write_accessor(accessor); + return contains_non_zero(r_accessor); +} + +GenericTensorAccessorR + create_cpu_compatible_accessor_r(GenericTensorAccessorR const &accessor, + Allocator &cpu_allocator) { + GenericTensorAccessorR cpu_accessor = accessor; + if (accessor.device_type == DeviceType::GPU) { + cpu_accessor = copy_tensor_accessor_r(accessor, cpu_allocator); + } + return cpu_accessor; +} + +GenericTensorAccessorW + create_cpu_compatible_accessor_w(GenericTensorAccessorW const &accessor, + Allocator &cpu_allocator) { + GenericTensorAccessorW cpu_accessor = accessor; + if (accessor.device_type == DeviceType::GPU) { + cpu_accessor = copy_tensor_accessor_w(accessor, cpu_allocator); + } + return cpu_accessor; +} + +template +struct PrintCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor) { + using T = real_type_t
<DT>;
+
+    T const *data_ptr = accessor.get<DT>
(); + for (size_t i = 0; i < accessor.shape.num_elements(); i++) { + std::cout << data_ptr[i] << " "; + } + std::cout << "\n"; + } +}; + +void print_accessor(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + create_cpu_compatible_accessor_r(accessor, cpu_allocator); + DataTypeDispatch1{}(accessor.data_type, accessor); +} + +void print_accessor(GenericTensorAccessorW const &accessor) { + GenericTensorAccessorR r_accessor = + read_only_accessor_from_write_accessor(accessor); + print_accessor(r_accessor); +} + } // namespace FlexFlow diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index 22dda0029a..4de114bd48 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -8,152 +8,117 @@ #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" #include "op-attrs/datatype.h" -#include "utils/containers/all_of.h" -#include namespace FlexFlow { -bool device_on_cpu(DeviceType); -bool device_on_gpu(DeviceType); +GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, + Allocator &allocator); -GenericTensorAccessorR - copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, - Allocator &allocator); +GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, + Allocator &allocator); -GenericTensorAccessorW - copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, - Allocator &allocator); +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator); TensorShape make_tensor_shape_from_legion_dims(LegionOrdered const &dims, DataType DT); -void fill_with_zeros(GenericTensorAccessorW const &accessor); +bool contains_non_zero(GenericTensorAccessorW const &accessor); -template -void transfer_memory(GenericTensorAccessorW dst_accessor, - const DT *src, - DeviceType src_device_type) { - size_t bytes = dst_accessor.shape.get_volume() * sizeof(DT); - - DeviceType dst_device_type = dst_accessor.device_type; - - if (device_on_cpu(src_device_type) && device_on_cpu(dst_device_type)) { - memcpy(dst_accessor.ptr, src, bytes); - } else if (device_on_cpu(src_device_type) && device_on_gpu(dst_device_type)) { - checkCUDA(cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyHostToDevice)); - } else if (device_on_gpu(src_device_type) && device_on_cpu(dst_device_type)) { - checkCUDA(cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyDeviceToHost)); - } else { - checkCUDA( - cudaMemcpy(dst_accessor.ptr, src, bytes, cudaMemcpyDeviceToDevice)); - } -} +bool contains_non_zero(GenericTensorAccessorR const &accessor); -template -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator) { - assert(shape.data_type == DataType::FLOAT || - shape.data_type == DataType::DOUBLE); +void fill_with_zeros(GenericTensorAccessorW const &accessor); - using T = real_type_t
; +GenericTensorAccessorW + create_cpu_compatible_accessor_w(GenericTensorAccessorW const &accessor, + Allocator &allocator); - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); +GenericTensorAccessorR + create_cpu_compatible_accessor_r(GenericTensorAccessorR const &accessor, + Allocator &allocator); - std::vector host_data(accessor.shape.num_elements()); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dist(-1.0, 1.0); +void print_accessor(GenericTensorAccessorR const &accessor); - for (auto &val : host_data) { - val = dist(gen); - } +void print_accessor(GenericTensorAccessorW const &accessor); - transfer_memory(accessor, host_data.data(), DeviceType::CPU); +template +struct CreateFilledAccessorW { + GenericTensorAccessorW operator()(TensorShape const &shape, + Allocator &allocator, + real_type_t
<DT> val) {
+    using T = real_type_t<DT>
; - return accessor; -} + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); -template -GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, - Allocator &allocator) { - using T = real_type_t
<DT>;
-  GenericTensorAccessorW accessor =
-      create_random_filled_accessor_w<DT>
(shape, allocator); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); - return read_only_accessor_from_write_accessor(accessor); -} + T *data_ptr = src_accessor.get
(); + for (size_t i = 0; i < dst_accessor.shape.num_elements(); i++) { + data_ptr[i] = val; + } + + transfer_data_between_accessors(dst_accessor, src_accessor); + return dst_accessor; + } +}; template GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, Allocator &allocator, T val) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - - size_t volume = accessor.shape.get_volume(); - std::vector host_data(volume, val); - - transfer_memory(accessor, host_data.data(), DeviceType::CPU); + return DataTypeDispatch1{}( + shape.data_type, shape, std::ref(allocator), val); +} - return accessor; +template +GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, + Allocator &allocator, + T val) { + GenericTensorAccessorW w_accessor = + create_filled_accessor_w(shape, allocator, val); + return read_only_accessor_from_write_accessor(w_accessor); } template -std::vector> - load_accessor_data(GenericTensorAccessorR accessor) { - using T = real_type_t
; +bool w_accessors_are_equal(GenericTensorAccessorW const &accessor_a, + GenericTensorAccessorW const &accessor_b) { + if (accessor_a.shape.num_dims() != accessor_b.shape.num_dims()) { + throw mk_runtime_error( + "Comparing equivalence for two accessors of differing dimensions"); + } + for (size_t i = 0; i < accessor_a.shape.num_dims(); i++) { + if (accessor_a.shape[legion_dim_t(i)] != + accessor_b.shape[legion_dim_t(i)]) { + throw mk_runtime_error( + "Comparing equivalence for two accessors of differing shape"); + } + } - int volume = accessor.shape.get_volume(); - std::vector local_data(volume); - T const *src_ptr = accessor.get
(); - - if (device_on_cpu(accessor.device_type)) { - memcpy(local_data.data(), src_ptr, volume * sizeof(T)); - } else { - checkCUDA(cudaMemcpy(local_data.data(), - src_ptr, - volume * sizeof(T), - cudaMemcpyDeviceToHost)); + if (accessor_a.data_type != accessor_b.data_type) { + return false; } - return local_data; -} + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor_a = + create_cpu_compatible_accessor_w(accessor_a, cpu_allocator); + GenericTensorAccessorW cpu_accessor_b = + create_cpu_compatible_accessor_w(accessor_b, cpu_allocator); -template -std::vector> - load_accessor_data(GenericTensorAccessorW accessor) { using T = real_type_t
; - - int volume = accessor.shape.get_volume(); - std::vector local_data(volume); - T const *src_ptr = accessor.get
(); - - if (device_on_cpu(accessor.device_type)) { - memcpy(local_data.data(), src_ptr, volume * sizeof(T)); - } else { - checkCUDA(cudaMemcpy(local_data.data(), - src_ptr, - volume * sizeof(T), - cudaMemcpyDeviceToHost)); + T *a_data_ptr = cpu_accessor_a.get
<DT>();
+  T *b_data_ptr = cpu_accessor_b.get<DT>
(); + + for (size_t i = 0; i < accessor_a.shape.num_elements(); i++) { + if (a_data_ptr[i] != b_data_ptr[i]) { + print_accessor(cpu_accessor_a); + print_accessor(cpu_accessor_b); + return false; + } } - return local_data; -} - -template -bool contains_non_zero(std::vector &data) { - return !all_of(data, [](T const &val) { return val == 0; }); -} - -template -bool vectors_are_approx_equal(T lhs, T rhs) { - float epsilon = 0.0001f; - return std::equal( - lhs.begin(), - lhs.end(), - rhs.begin(), - rhs.end(), - [epsilon](float a, float b) { return std::abs(a - b) < epsilon; }); + return true; } } // namespace FlexFlow diff --git a/lib/local-execution/src/local_task_argument_accessor.cc b/lib/local-execution/src/local_task_argument_accessor.cc index f61ed7bc7b..5d099c6b46 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -24,11 +24,8 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( auto tensor_backing = std::get( this->tensor_slots_backing.at(slot_grad_pair)); if (priv == Permissions::RO) { - GenericTensorAccessorR readonly_tensor_backing = { - tensor_backing.data_type, - tensor_backing.shape, - tensor_backing.ptr, - this->allocator.get_allocation_device_type()}; + GenericTensorAccessorR readonly_tensor_backing = + read_only_accessor_from_write_accessor(tensor_backing); return readonly_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { return tensor_backing; @@ -47,10 +44,7 @@ VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( for (GenericTensorAccessorW const &tensor_backing : variadic_tensor_backing) { readonly_variadic_tensor_backing.push_back( - {tensor_backing.data_type, - tensor_backing.shape, - tensor_backing.ptr, - this->allocator.get_allocation_device_type()}); + read_only_accessor_from_write_accessor(tensor_backing)); } return readonly_variadic_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index 135475a711..56bbfdd371 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -67,8 +67,8 @@ static std::optional return profile(backward_kernel, profiling, "[replicate] backward_time = {:.2lf}ms\n", - input_grad, output_grad, + input_grad, attrs.replicate_degree); } From e869ace02b8aaf00feae775023dae755b14e43aa Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 14 Oct 2024 23:41:17 -0700 Subject: [PATCH 12/20] formatting --- lib/kernels/test/src/test_reduction_kernel.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index d727e267fe..8706c5d877 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -36,7 +36,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_shape = input_shape; GenericTensorAccessorR output_grad_accessor = - create_filled_accessor_r(output_shape, allocator, 1.0f); + create_filled_accessor_r(output_shape, allocator, 1.0f); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); From de230cb86911207070708f7e23086bb0ac01b49d Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 14 Oct 2024 23:55:20 -0700 Subject: [PATCH 13/20] comment removal reverse_kernels --- lib/kernels/src/cuda/ops/reverse_kernels.cu | 48 +++------------------ 1 file changed, 6 
insertions(+), 42 deletions(-) diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 8e93fec0d6..2c25293c36 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -17,44 +17,9 @@ #include "kernels/reverse_kernels.h" namespace FlexFlow { - namespace Kernels { namespace Reverse { -// __global__ void reverse_forward_kernel(float const *in_ptr, -// float *out_ptr, -// coord_t num_out_blks, -// coord_t reverse_dim_size, -// coord_t in_blk_size) { -// CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { -// coord_t out_idx = i; -// coord_t blk_idx = i / (reverse_dim_size * in_blk_size); -// i = i - blk_idx * (reverse_dim_size * in_blk_size); -// coord_t reverse_dim_idx = i / in_blk_size; -// i = i - reverse_dim_idx * in_blk_size; -// coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + -// (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + -// i; -// out_ptr[out_idx] = in_ptr[in_idx]; -// } -// CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { -// coord_t blk_idx = i / (reverse_dim_size * in_blk_size); -// i = i - blk_idx * (reverse_dim_size * in_blk_size); -// coord_t reverse_dim_idx = i / in_blk_size; -// i = i - reverse_dim_idx * in_blk_size; -// coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + -// (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + -// i; -// out_ptr[i] = in_ptr[in_idx]; -// } -// } - -/* I mentioned this earlier, but I still think the reverse_forward_kernel code - is incorrect, even though it matches the code in inference/master? Whenever - I'm testing the code and printing out the output, I'm getting unexpected - outputs, and I think it's a result of modifying the loop index i in the - previous code? 
-*/ __global__ void reverse_forward_kernel(float const *in_ptr, float *out_ptr, coord_t num_out_blks, @@ -62,13 +27,12 @@ __global__ void reverse_forward_kernel(float const *in_ptr, coord_t in_blk_size) { CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { coord_t blk_idx = i / (reverse_dim_size * in_blk_size); - coord_t idx_within_blk = i % (reverse_dim_size * in_blk_size); - coord_t reverse_dim_idx = idx_within_blk / in_blk_size; - coord_t in_idx = idx_within_blk % in_blk_size; - coord_t input_index = - blk_idx * (reverse_dim_size * in_blk_size) + - (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + in_idx; - out_ptr[i] = in_ptr[input_index]; + i = i - blk_idx * (reverse_dim_size * in_blk_size); + coord_t reverse_dim_idx = i / in_blk_size; + i = i - reverse_dim_idx * in_blk_size; + coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i; + out_ptr[i] = in_ptr[in_idx]; } } From 3fc8718b3573fab3285a69913ab2985aaeb9bd4c Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 15 Oct 2024 19:22:55 -0700 Subject: [PATCH 14/20] Issue #1435, tests for managed stream and handle --- lib/kernels/src/managed_ff_stream.cc | 19 +++++++---- .../src/managed_per_device_ff_handle.cc | 33 +++++++++++------- .../test/src/test_managed_ff_stream.cc | 29 ++++++++++++++++ .../src/test_managed_per_device_ff_handle.cc | 34 +++++++++++++++++++ 4 files changed, 97 insertions(+), 18 deletions(-) create mode 100644 lib/kernels/test/src/test_managed_ff_stream.cc create mode 100644 lib/kernels/test/src/test_managed_per_device_ff_handle.cc diff --git a/lib/kernels/src/managed_ff_stream.cc b/lib/kernels/src/managed_ff_stream.cc index 7385b6cc3e..a8b44dc1d3 100644 --- a/lib/kernels/src/managed_ff_stream.cc +++ b/lib/kernels/src/managed_ff_stream.cc @@ -1,28 +1,35 @@ #include "kernels/managed_ff_stream.h" +#include "utils/exception.h" namespace FlexFlow { ManagedFFStream::ManagedFFStream() : stream(new ffStream_t) { - checkCUDA(cudaStreamCreate(stream)); + checkCUDA(cudaStreamCreate(this->stream)); } ManagedFFStream::ManagedFFStream(ManagedFFStream &&other) noexcept : stream(std::exchange(other.stream, nullptr)) {} ManagedFFStream &ManagedFFStream::operator=(ManagedFFStream &&other) noexcept { - std::swap(this->stream, other.stream); + if (this != &other) { + if (this->stream != nullptr) { + checkCUDA(cudaStreamDestroy(*this->stream)); + delete stream; + } + this->stream = std::exchange(other.stream, nullptr); + } return *this; } ManagedFFStream::~ManagedFFStream() { - if (stream != nullptr) { - checkCUDA(cudaStreamDestroy(*stream)); - delete stream; + if (this->stream != nullptr) { + checkCUDA(cudaStreamDestroy(*this->stream)); + delete this->stream; } } ffStream_t const &ManagedFFStream::raw_stream() const { - return *stream; + return *this->stream; } } // namespace FlexFlow diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index c050e887b6..ca105f9bc9 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -4,13 +4,13 @@ namespace FlexFlow { ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { - handle = new PerDeviceFFHandle; - handle->workSpaceSize = 1024 * 1024; - handle->allowTensorOpMathConversion = true; + this->handle = new PerDeviceFFHandle; + this->handle->workSpaceSize = 1024 * 1024; + this->handle->allowTensorOpMathConversion = true; - checkCUDNN(cudnnCreate(&handle->dnn)); - 
checkCUBLAS(cublasCreate(&handle->blas)); - checkCUDA(cudaMalloc(&handle->workSpace, handle->workSpaceSize)); + checkCUDNN(cudnnCreate(&this->handle->dnn)); + checkCUBLAS(cublasCreate(&this->handle->blas)); + checkCUDA(cudaMalloc(&this->handle->workSpace, this->handle->workSpaceSize)); } ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( @@ -19,16 +19,25 @@ ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( ManagedPerDeviceFFHandle &ManagedPerDeviceFFHandle::operator=( ManagedPerDeviceFFHandle &&other) noexcept { - std::swap(this->handle, other.handle); + if (this != &other) { + if (this->handle != nullptr) { + checkCUDNN(cudnnDestroy(this->handle->dnn)); + checkCUBLAS(cublasDestroy(this->handle->blas)); + checkCUDA(cudaFree(this->handle->workSpace)); + delete this->handle; + } + this->handle = std::exchange(other.handle, nullptr); + } return *this; } ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { - if (handle != nullptr) { - checkCUDNN(cudnnDestroy(handle->dnn)); - checkCUBLAS(cublasDestroy(handle->blas)); - checkCUDA(cudaFree(handle->workSpace)); - delete handle; + if (this->handle != nullptr) { + checkCUDNN(cudnnDestroy(this->handle->dnn)); + checkCUBLAS(cublasDestroy(this->handle->blas)); + checkCUDA(cudaFree(this->handle->workSpace)); + delete this->handle; + this->handle = nullptr; } } diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc new file mode 100644 index 0000000000..1dc40f0a92 --- /dev/null +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -0,0 +1,29 @@ +#include "doctest/doctest.h" +#include "kernels/managed_ff_stream.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test Managed FF Stream") { + ManagedFFStream base_stream{}; + + SUBCASE("Test ManagedFFStream Move Constructor") { + ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); + + ManagedFFStream new_stream(std::move(base_stream)); + + CHECK(&base_stream.raw_stream() == nullptr); + CHECK(&new_stream.raw_stream() == base_stream_ptr); + } + + SUBCASE("Test ManagedFFStream Assignment Operator") { + ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); + + ManagedFFStream new_stream{}; + new_stream = std::move(base_stream); + + CHECK(&base_stream.raw_stream() == nullptr); + CHECK(&new_stream.raw_stream() == base_stream_ptr); + } + } +} diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc new file mode 100644 index 0000000000..d99d375a7c --- /dev/null +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -0,0 +1,34 @@ +#include "doctest/doctest.h" +#include "kernels/managed_per_device_ff_handle.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Test Managed Per Device FF Handle") { + ManagedPerDeviceFFHandle base_handle{}; + + SUBCASE("Test ManagedPerDeviceFFHandle Constructor") { + CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); + CHECK(base_handle.raw_handle().allowTensorOpMathConversion == true); + } + + SUBCASE("Test ManagedPerDeviceFFHandle Move Constructor") { + PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); + + ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); + + CHECK(&base_handle.raw_handle() == nullptr); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } + + SUBCASE("Test ManagedPerDeviceFFHandle Assignment Operator") { + PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); + + ManagedPerDeviceFFHandle 
new_handle{}; + new_handle = std::move(base_handle); + + CHECK(&base_handle.raw_handle() == nullptr); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } + } +} From d1c9e90f9520fbd6f1d59d4766f257ddec075792 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 15 Oct 2024 19:25:18 -0700 Subject: [PATCH 15/20] #1435 formatting --- lib/kernels/test/src/test_managed_ff_stream.cc | 6 +++--- lib/kernels/test/src/test_managed_per_device_ff_handle.cc | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index 1dc40f0a92..1dedb0c41d 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -5,11 +5,11 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Managed FF Stream") { - ManagedFFStream base_stream{}; + ManagedFFStream base_stream{}; SUBCASE("Test ManagedFFStream Move Constructor") { ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); - + ManagedFFStream new_stream(std::move(base_stream)); CHECK(&base_stream.raw_stream() == nullptr); @@ -21,7 +21,7 @@ TEST_SUITE(FF_TEST_SUITE) { ManagedFFStream new_stream{}; new_stream = std::move(base_stream); - + CHECK(&base_stream.raw_stream() == nullptr); CHECK(&new_stream.raw_stream() == base_stream_ptr); } diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index d99d375a7c..e85cfd61c7 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -5,7 +5,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Managed Per Device FF Handle") { - ManagedPerDeviceFFHandle base_handle{}; + ManagedPerDeviceFFHandle base_handle{}; SUBCASE("Test ManagedPerDeviceFFHandle Constructor") { CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); @@ -14,7 +14,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Test ManagedPerDeviceFFHandle Move Constructor") { PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); - + ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); CHECK(&base_handle.raw_handle() == nullptr); @@ -26,7 +26,7 @@ TEST_SUITE(FF_TEST_SUITE) { ManagedPerDeviceFFHandle new_handle{}; new_handle = std::move(base_handle); - + CHECK(&base_handle.raw_handle() == nullptr); CHECK(&new_handle.raw_handle() == base_handle_ptr); } From 7106dec57901c3e580ed130589fbe0965b5db3da Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Tue, 15 Oct 2024 20:24:27 -0700 Subject: [PATCH 16/20] #1409 issue, change datatype for linear kernels away from void * --- lib/kernels/include/kernels/linear_kernels.h | 22 +++--- lib/kernels/src/cuda/ops/linear_kernels.cu | 76 +++++++++++--------- lib/local-execution/src/ops/linear.cc | 14 ++-- 3 files changed, 59 insertions(+), 53 deletions(-) diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h index 99549adece..cff6563629 100644 --- a/lib/kernels/include/kernels/linear_kernels.h +++ b/lib/kernels/include/kernels/linear_kernels.h @@ -50,23 +50,23 @@ bool use_activation(Activation activation); void forward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *filter_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size); 
void backward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_ptr, int in_dim, int out_dim, int batch_size); diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index ca51f0d216..29b77fd9d9 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -108,10 +108,10 @@ LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, void forward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *weight_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size) { @@ -135,14 +135,14 @@ void forward_kernel(cudaStream_t stream, batch_size, in_dim, &alpha, - weight_ptr, + (void *)weight_ptr, weight_type, in_dim, - input_ptr, + (void *)input_ptr, input_type, in_dim, &beta, - output_ptr, + (void *)output_ptr, output_type, out_dim, compute_type, @@ -156,14 +156,14 @@ void forward_kernel(cudaStream_t stream, batch_size, 1, &alpha, - bias_ptr, + (void *)bias_ptr, weight_type, 1, - m.one_ptr, + (void *)m.one_ptr, CUDA_R_32F, 1, &alpha, - output_ptr, + (void *)output_ptr, output_type, out_dim, compute_type, @@ -174,10 +174,10 @@ void forward_kernel(cudaStream_t stream, m.actiDesc, &alpha, m.outputTensor, - output_ptr, + (void *)output_ptr, &beta, m.outputTensor, - output_ptr)); + (void *)output_ptr)); } else if (m.activation == Activation::GELU) { size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) @@ -191,13 +191,13 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, int in_dim, int out_dim, int batch_size) { @@ -216,11 +216,17 @@ void backward_kernel(cudaStream_t stream, int output_size = out_dim * batch_size; if (m.activation.has_value()) { if (m.activation == Activation::RELU) { - relu_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + relu_backward_kernel(m.output_type, + (void *)output_grad_ptr, + (void *)output_ptr, + output_size, + stream); } else if (m.activation == Activation::SIGMOID) { - sigmoid_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + sigmoid_backward_kernel(m.output_type, + (void *)output_grad_ptr, + (void *)output_ptr, + output_size, + stream); } else { // TODO: only support relu and sigmoid for now assert(false && "Unsupported activation for Linear"); @@ -235,14 +241,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - input_ptr, + (void *)input_ptr, input_type, in_dim, - output_grad_ptr, + (void *)output_grad_ptr, output_type, out_dim, &alpha, - kernel_grad_ptr, + (void 
*)kernel_grad_ptr, weight_type, in_dim, compute_type, @@ -261,12 +267,12 @@ void backward_kernel(cudaStream_t stream, in_dim, out_dim, &alpha, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim, &lambda, - (float *)kernel_ptr, + kernel_ptr, in_dim, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim)); } else { assert(false && "Only L2 regularization is supported"); @@ -284,14 +290,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - m.one_ptr, + (void *)m.one_ptr, CUDA_R_32F, 1, - output_grad_ptr, + (void *)output_grad_ptr, output_type, out_dim, &alpha, - bias_grad_ptr, + (void *)bias_grad_ptr, weight_type, 1, compute_type, @@ -307,14 +313,14 @@ void backward_kernel(cudaStream_t stream, batch_size, out_dim, &alpha, - kernel_ptr, + (void *)kernel_ptr, weight_type, in_dim, - output_grad_ptr, + (void *)output_grad_ptr, output_type, out_dim, &alpha, - input_grad_ptr, + (void *)input_grad_ptr, input_type, in_dim, compute_type, diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 9934e2a45c..860eedaa1c 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -148,13 +148,13 @@ static std::optional profiling, "[Linear] backward_time = {:.2lf}ms\n", per_device_state, - (void *)input.get_float_ptr(), - (void *)input_grad.get_float_ptr(), - (void *)output.get_float_ptr(), - (void *)output_grad.get_float_ptr(), - (void *)weight.get_float_ptr(), - (void *)weight_grad.get_float_ptr(), - (void *)bias_ptr, + input.get_float_ptr(), + (float *)input_grad.get_float_ptr(), + output.get_float_ptr(), + (float *)output_grad.get_float_ptr(), + weight.get_float_ptr(), + (float *)weight_grad.get_float_ptr(), + (float *)bias_ptr, in_dim, out_dim, batch_size); From 51c3eb7ae4f84f8bc33811b5f1842187ca81c6ce Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Mon, 4 Nov 2024 23:12:02 -0800 Subject: [PATCH 17/20] R & W accessor changes, minimize code bloat --- lib/kernels/include/kernels/accessor.h | 154 ++++++++---------- lib/kernels/include/kernels/cast_kernels.h | 8 +- .../include/kernels/cast_kernels_cpu.h | 8 +- .../include/kernels/datatype_dispatch.h | 10 +- .../kernels/managed_per_device_ff_handle.h | 5 +- lib/kernels/src/accessor.cc | 107 +++--------- lib/kernels/src/cpu/cast_kernels.cc | 13 +- lib/kernels/src/cpu/replicate_kernels.cc | 9 +- lib/kernels/src/cpu/reverse_kernels.cc | 24 +-- lib/kernels/src/cuda/ops/cast_kernels.cu | 12 +- lib/kernels/src/cuda/ops/linear_kernels.cu | 42 ++--- .../src/managed_per_device_ff_handle.cc | 8 +- lib/kernels/test/src/test_attention_kernel.cc | 2 +- .../test/src/test_batch_matmul_kernel.cc | 2 +- .../test/src/test_batch_norm_kernel.cc | 6 +- lib/kernels/test/src/test_cast_kernel.cc | 25 +-- lib/kernels/test/src/test_combine_kernel.cc | 9 +- lib/kernels/test/src/test_concat_kernel.cc | 2 +- lib/kernels/test/src/test_dropout.cc | 2 +- lib/kernels/test/src/test_flat_kernel.cc | 12 +- lib/kernels/test/src/test_gather_kernels.cc | 2 +- .../test/src/test_layer_norm_kernels.cc | 8 +- .../test/src/test_managed_ff_stream.cc | 12 +- .../src/test_managed_per_device_ff_handle.cc | 14 +- lib/kernels/test/src/test_partition_kernel.cc | 10 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 6 +- lib/kernels/test/src/test_reduction_kernel.cc | 6 +- lib/kernels/test/src/test_replicate_kernel.cc | 11 +- lib/kernels/test/src/test_reshape_kernel.cc | 2 +- lib/kernels/test/src/test_reverse_kernels.cc | 17 +- lib/kernels/test/src/test_softmax_kernel.cc | 2 +- 
lib/kernels/test/src/test_split_kernel.cc | 4 +- lib/kernels/test/src/test_transpose_kernel.cc | 2 +- lib/kernels/test/src/test_utils.cc | 114 ++++++++++--- lib/kernels/test/src/test_utils.h | 92 ++--------- lib/local-execution/src/ops/cast.cc | 8 +- lib/local-execution/src/ops/linear.cc | 14 +- .../test/src/test_local_cost_estimator.cc | 2 +- 38 files changed, 330 insertions(+), 456 deletions(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 0a134db695..653c8db42d 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -13,54 +13,36 @@ namespace FlexFlow { struct Allocator; -class GenericTensorAccessorW { +class GenericTensorAccessorR { public: template - typename data_type_enum_to_class
<DT>::type *get() const {
+  typename data_type_enum_to_class<DT>
::type const *get() const { if (this->data_type == DT) { - return static_cast *>(this->ptr); + return static_cast const *>(this->ptr); } else { throw mk_runtime_error(fmt::format( "Invalid access data type ({} != {})", this->data_type, DT)); } } - int32_t *get_int32_ptr() const; - int64_t *get_int64_ptr() const; - float *get_float_ptr() const; - double *get_double_ptr() const; - half *get_half_ptr() const; + int32_t const *get_int32_ptr() const; + int64_t const *get_int64_ptr() const; + float const *get_float_ptr() const; + double const *get_double_ptr() const; + half const *get_half_ptr() const; - GenericTensorAccessorW() = delete; + GenericTensorAccessorR() = delete; - GenericTensorAccessorW(DataType data_type, + GenericTensorAccessorR(DataType data_type, ArrayShape const &shape, - void *ptr, + void const *ptr, DeviceType device_type); - bool operator==(GenericTensorAccessorW const &) const; - bool operator!=(GenericTensorAccessorW const &) const; - - template - real_type_t
&at(Indices... indices) { - if (this->device_type != DeviceType::CPU) { - throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); - } - if (this->data_type != DT) { - throw mk_runtime_error(fmt::format( - "Invalid access data type ({} != {})", this->data_type, DT)); - } - - using T = real_type_t
; - - T *data_ptr = static_cast(this->ptr); - size_t offset = calculate_index_offset({static_cast(indices)...}); - - return data_ptr[offset]; - } + bool operator==(GenericTensorAccessorR const &) const; + bool operator!=(GenericTensorAccessorR const &) const; - template - real_type_t
<DT> const &at(Indices... indices) const {
+  template <DataType DT>
+  real_type_t<DT>
const &at(std::vector const &indices) const { if (this->device_type != DeviceType::CPU) { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } @@ -72,7 +54,7 @@ class GenericTensorAccessorW { using T = real_type_t
; T const *data_ptr = static_cast(this->ptr); - size_t offset = calculate_index_offset({static_cast(indices)...}); + size_t offset = calculate_index_offset(indices); return data_ptr[offset]; } @@ -80,7 +62,7 @@ class GenericTensorAccessorW { public: DataType data_type; ArrayShape shape; - void *ptr; + void const *ptr; DeviceType device_type; private: @@ -90,43 +72,62 @@ class GenericTensorAccessorW { decltype(device_type) const &> tie() const; - size_t calculate_index_offset( - std::initializer_list const &indices) const; + size_t calculate_index_offset(std::vector const &indices) const; }; -std::string format_as(GenericTensorAccessorW const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); +std::string format_as(GenericTensorAccessorR const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); -class GenericTensorAccessorR { +class GenericTensorAccessorW { public: template - typename data_type_enum_to_class
<DT>::type const *get() const {
+  typename data_type_enum_to_class<DT>
::type *get() const { if (this->data_type == DT) { - return static_cast const *>(this->ptr); + return static_cast *>(this->ptr); } else { throw mk_runtime_error(fmt::format( "Invalid access data type ({} != {})", this->data_type, DT)); } } - int32_t const *get_int32_ptr() const; - int64_t const *get_int64_ptr() const; - float const *get_float_ptr() const; - double const *get_double_ptr() const; - half const *get_half_ptr() const; + int32_t *get_int32_ptr() const; + int64_t *get_int64_ptr() const; + float *get_float_ptr() const; + double *get_double_ptr() const; + half *get_half_ptr() const; - GenericTensorAccessorR() = delete; + GenericTensorAccessorW() = delete; - GenericTensorAccessorR(DataType data_type, + GenericTensorAccessorW(DataType data_type, ArrayShape const &shape, - void const *ptr, + void *ptr, DeviceType device_type); - bool operator==(GenericTensorAccessorR const &) const; - bool operator!=(GenericTensorAccessorR const &) const; + bool operator==(GenericTensorAccessorW const &) const; + bool operator!=(GenericTensorAccessorW const &) const; + + operator GenericTensorAccessorR() const; + + template + real_type_t
&at(std::vector const &indices) { + if (this->device_type != DeviceType::CPU) { + throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); + } + if (this->data_type != DT) { + throw mk_runtime_error(fmt::format( + "Invalid access data type ({} != {})", this->data_type, DT)); + } + + using T = real_type_t
; + + T *data_ptr = static_cast(this->ptr); + size_t offset = calculate_index_offset(indices); + + return data_ptr[offset]; + } - template - real_type_t
<DT> const &at(Indices... indices) const {
+  template <DataType DT>
+  real_type_t<DT>
&at(std::vector const &indices) const { if (this->device_type != DeviceType::CPU) { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } @@ -138,7 +139,7 @@ class GenericTensorAccessorR { using T = real_type_t
; T const *data_ptr = static_cast(this->ptr); - size_t offset = calculate_index_offset({static_cast(indices)...}); + size_t offset = calculate_index_offset(indices); return data_ptr[offset]; } @@ -146,7 +147,7 @@ class GenericTensorAccessorR { public: DataType data_type; ArrayShape shape; - void const *ptr; + void *ptr; DeviceType device_type; private: @@ -156,27 +157,11 @@ class GenericTensorAccessorR { decltype(device_type) const &> tie() const; - size_t calculate_index_offset( - std::initializer_list const &indices) const; + size_t calculate_index_offset(std::vector const &indices) const; }; -std::string format_as(GenericTensorAccessorR const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); - -int32_t *get_int32_ptr(GenericTensorAccessorW const &); -int64_t *get_int64_ptr(GenericTensorAccessorW const &); -float *get_float_ptr(GenericTensorAccessorW const &); -double *get_double_ptr(GenericTensorAccessorW const &); -half *get_half_ptr(GenericTensorAccessorW const &); -std::vector - get_int32_ptrs(std::vector const &); -std::vector - get_int64_ptrs(std::vector const &); -std::vector - get_float_ptrs(std::vector const &); -std::vector - get_double_ptrs(std::vector const &); -std::vector get_half_ptrs(std::vector const &); +std::string format_as(GenericTensorAccessorW const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); static_assert(is_fmtable const &>::value, ""); @@ -241,12 +226,8 @@ std::vector const *> GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &write_accessor); -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2); - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype); +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2); bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, @@ -254,16 +235,9 @@ bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, std::pair get_shape_and_datatype(GenericTensorAccessorR const &accessor); -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor); - -void transfer_data_between_accessors( - GenericTensorAccessorW &dst_accessor, - GenericTensorAccessorR const &src_accessor); -void transfer_data_between_accessors( - GenericTensorAccessorW &dst_accessor, - GenericTensorAccessorW const &src_accessor); +void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor); GenericTensorAccessorR copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index f67613cec6..21e76fed1d 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -8,15 +8,11 @@ namespace FlexFlow::Kernels::Cast { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); } // namespace FlexFlow::Kernels::Cast diff --git 
a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h index 959617dcae..275476b4e6 100644 --- a/lib/kernels/include/kernels/cast_kernels_cpu.h +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -7,14 +7,10 @@ namespace FlexFlow::Kernels::Cast { void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); void cpu_backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/include/kernels/datatype_dispatch.h b/lib/kernels/include/kernels/datatype_dispatch.h index 0986d99791..50ca66a820 100644 --- a/lib/kernels/include/kernels/datatype_dispatch.h +++ b/lib/kernels/include/kernels/datatype_dispatch.h @@ -34,7 +34,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(Args... args) const { + Out operator()(Args &&...args) const { return F
{}(std::forward(args)...); } }; @@ -42,7 +42,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(DataType data_type, Args... args) { + Out operator()(DataType data_type, Args &&...args) { return dispatch(data_type, std::forward(args)...); } }; @@ -55,13 +55,13 @@ struct DataTypeDispatch2 { template struct OutputType { template - void operator()(Args... args) const { + void operator()(Args &&...args) const { F{}(std::forward(args)...); } }; template - void operator()(DataType output_type, Args... args) const { + void operator()(DataType output_type, Args &&...args) const { dispatch(output_type, std::forward(args)...); } }; @@ -69,7 +69,7 @@ struct DataTypeDispatch2 { template void operator()(DataType input_data_type, DataType output_data_type, - Args... args) { + Args &&...args) { dispatch( input_data_type, output_data_type, std::forward(args)...); } diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index 0a83a5eecb..f9f944c6ff 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -7,7 +7,10 @@ namespace FlexFlow { struct ManagedPerDeviceFFHandle { public: - ManagedPerDeviceFFHandle(); + ManagedPerDeviceFFHandle() = delete; + + ManagedPerDeviceFFHandle(size_t workSpaceSize, + bool allowTensorOpMathConversion); ManagedPerDeviceFFHandle(ManagedPerDeviceFFHandle const &) = delete; ManagedPerDeviceFFHandle & diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index 9332dd6703..4cb5bd83a2 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -4,7 +4,7 @@ namespace FlexFlow { -void transfer_data_between_accessors( +void copy_accessor_data_to_l_from_r( GenericTensorAccessorW &dst_accessor, GenericTensorAccessorR const &src_accessor) { size_t num_bytes = dst_accessor.shape.get_volume() * @@ -25,6 +25,8 @@ void transfer_data_between_accessors( checkCUDA(cudaMemcpy( dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); } else { + assert(src_device_type == DeviceType::GPU); + assert(src_device_type == DeviceType::CPU); checkCUDA(cudaMemcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes, @@ -32,12 +34,8 @@ void transfer_data_between_accessors( } } -void transfer_data_between_accessors( - GenericTensorAccessorW &dst_accessor, - GenericTensorAccessorW const &src_accessor) { - GenericTensorAccessorR r_src_accessor = - read_only_accessor_from_write_accessor(src_accessor); - transfer_data_between_accessors(dst_accessor, r_src_accessor); +GenericTensorAccessorW::operator GenericTensorAccessorR() const { + return read_only_accessor_from_write_accessor(*this); } GenericTensorAccessorW::GenericTensorAccessorW( @@ -56,7 +54,7 @@ std::tuple const &indices) const { + std::vector const &indices) const { if (indices.size() != this->shape.num_dims()) { throw mk_runtime_error(fmt::format( @@ -67,22 +65,18 @@ size_t GenericTensorAccessorW::calculate_index_offset( size_t offset = 0; size_t multiplier = 1; - size_t cur_idx; - auto it = indices.begin(); for (size_t i = 0; i < this->shape.num_dims(); i++) { - cur_idx = *it++; - - if (cur_idx >= this->shape.at(legion_dim_t(i))) { + if (indices[i] >= this->shape.at(legion_dim_t(i))) { throw mk_runtime_error( fmt::format("In {} dimension, attempting to access index {} " "when only {} indexes exist", i, - cur_idx, + indices[i], this->shape.at(legion_dim_t(i)))); } - offset += cur_idx * multiplier; + offset += indices[i] 
* multiplier; multiplier *= this->shape.at(legion_dim_t(i)); } @@ -146,7 +140,7 @@ std::tuple const &indices) const { + std::vector const &indices) const { if (indices.size() != this->shape.num_dims()) { throw mk_runtime_error(fmt::format( @@ -155,24 +149,20 @@ size_t GenericTensorAccessorR::calculate_index_offset( this->shape.num_dims())); } - size_t offset = 0; + ssize_t offset = 0; size_t multiplier = 1; - size_t cur_idx; - auto it = indices.begin(); for (size_t i = 0; i < this->shape.num_dims(); i++) { - cur_idx = *it++; - - if (cur_idx >= this->shape.at(legion_dim_t(i))) { + if (indices[i] >= this->shape.at(legion_dim_t(i))) { throw mk_runtime_error( fmt::format("In {} dimension, attempting to access index {} " "when only {} indexes exist", i, - cur_idx, + indices[i], this->shape.at(legion_dim_t(i)))); } - offset += cur_idx * multiplier; + offset += indices[i] * multiplier; multiplier *= this->shape.at(legion_dim_t(i)); } @@ -220,51 +210,6 @@ std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { return (s << fmt::to_string(a)); } -int32_t *get_int32_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -int64_t *get_int64_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -float *get_float_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -double *get_double_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -half *get_half_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) { return get(a); } @@ -318,18 +263,11 @@ GenericTensorAccessorR read_only_accessor_from_write_accessor( writable.device_type}; } -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2) { +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2) { return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; } -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, DataType const &expected_dtype) { @@ -342,11 +280,6 @@ std::pair return std::make_pair(accessor.shape, accessor.data_type); } -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - template struct CopyTensorAccessorW { GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, @@ -355,7 +288,7 @@ struct CopyTensorAccessorW { get_tensor_shape(src_accessor.shape, src_accessor.data_type); GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); - transfer_data_between_accessors(dst_accessor, src_accessor); + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); return dst_accessor; } @@ -365,7 +298,7 @@ GenericTensorAccessorW copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, Allocator &allocator) { return DataTypeDispatch1{}( - 
src_accessor.data_type, src_accessor, std::ref(allocator)); + src_accessor.data_type, src_accessor, allocator); } template @@ -376,7 +309,7 @@ struct CopyTensorAccessorR { get_tensor_shape(src_accessor.shape, src_accessor.data_type); GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); - transfer_data_between_accessors(dst_accessor, src_accessor); + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); return read_only_accessor_from_write_accessor(dst_accessor); } @@ -386,7 +319,7 @@ GenericTensorAccessorR copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, Allocator &allocator) { return DataTypeDispatch1{}( - src_accessor.data_type, src_accessor, std::ref(allocator)); + src_accessor.data_type, src_accessor, allocator); } } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc index 2d3f440c75..5a00503fe4 100644 --- a/lib/kernels/src/cpu/cast_kernels.cc +++ b/lib/kernels/src/cpu/cast_kernels.cc @@ -37,18 +37,15 @@ struct CPUBackwardKernel { }; void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { - DataTypeDispatch2{}(input_type, output_type, input, output); + GenericTensorAccessorW const &output) { + DataTypeDispatch2{}( + input.data_type, output.data_type, input, output); } void cpu_backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, input, output); + input.data_type, output.data_type, input, output); } } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index 683739b91e..25693b374d 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -22,24 +22,23 @@ struct CPUBackwardKernel { for (size_t i = 0; i < input.shape.num_elements(); i++) { T cur_sum = 0; for (size_t j = 0; j < num_replicas; j++) { - cur_sum += output.at
<DT>(i, j);
+        cur_sum += output.at<DT>
({i, j}); } - input.at
<DT>(i) = cur_sum;
+      input.at<DT>
({i}) = cur_sum; } } }; void cpu_forward_kernel(GenericTensorAccessorR const &input, GenericTensorAccessorW &output) { - DataTypeDispatch1{}( - input.data_type, input, std::ref(output)); + DataTypeDispatch1{}(input.data_type, input, output); } void cpu_backward_kernel(GenericTensorAccessorR const &output, GenericTensorAccessorW &input, size_t num_replicas) { DataTypeDispatch1{}( - input.data_type, output, std::ref(input), num_replicas); + input.data_type, output, input, num_replicas); } } // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index bc114c4e60..e5b3719d74 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -11,17 +11,17 @@ struct CPUReverseForwardKernel { GenericTensorAccessorW &output) { assert(input.data_type == DT && output.data_type == DT); - coord_t num_out_blocks = input.shape.at(legion_dim_t(0)); - coord_t reverse_dim_size = input.shape.at(legion_dim_t(1)); - coord_t in_block_size = input.shape.at(legion_dim_t(2)); + size_t num_out_blocks = input.shape.at(legion_dim_t(0)); + size_t reverse_dim_size = input.shape.at(legion_dim_t(1)); + size_t in_block_size = input.shape.at(legion_dim_t(2)); - for (coord_t block_idx = 0; block_idx < num_out_blocks; block_idx++) { - for (coord_t rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) { - for (coord_t i = 0; i < in_block_size; i++) { - output.at
<DT>(block_idx, rev_idx, i) =
-              input.at<DT>
(num_out_blocks - 1 - block_idx, - reverse_dim_size - 1 - rev_idx, - in_block_size - 1 - i); + for (size_t block_idx = 0; block_idx < num_out_blocks; block_idx++) { + for (size_t rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) { + for (size_t i = 0; i < in_block_size; i++) { + output.at
<DT>({block_idx, rev_idx, i}) =
+              input.at<DT>
({num_out_blocks - 1 - block_idx, + reverse_dim_size - 1 - rev_idx, + in_block_size - 1 - i}); } } } @@ -31,13 +31,13 @@ struct CPUReverseForwardKernel { void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, GenericTensorAccessorW &output_accessor) { DataTypeDispatch1{}( - input_accessor.data_type, input_accessor, std::ref(output_accessor)); + input_accessor.data_type, input_accessor, output_accessor); } void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, GenericTensorAccessorW &input_accessor) { DataTypeDispatch1{}( - output_accessor.data_type, output_accessor, std::ref(input_accessor)); + output_accessor.data_type, output_accessor, input_accessor); } } // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index b895ffb68f..dc342fd0e0 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -60,20 +60,16 @@ struct BackwardKernel { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + input.data_type, output.data_type, stream, input, output); } void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + input.data_type, output.data_type, stream, input, output); } } // namespace Cast diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index 29b77fd9d9..f13ebee67e 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -135,14 +135,14 @@ void forward_kernel(cudaStream_t stream, batch_size, in_dim, &alpha, - (void *)weight_ptr, + reinterpret_cast(weight_ptr), weight_type, in_dim, - (void *)input_ptr, + reinterpret_cast(input_ptr), input_type, in_dim, &beta, - (void *)output_ptr, + reinterpret_cast(output_ptr), output_type, out_dim, compute_type, @@ -156,14 +156,14 @@ void forward_kernel(cudaStream_t stream, batch_size, 1, &alpha, - (void *)bias_ptr, + reinterpret_cast(bias_ptr), weight_type, 1, - (void *)m.one_ptr, + reinterpret_cast(m.one_ptr), CUDA_R_32F, 1, &alpha, - (void *)output_ptr, + reinterpret_cast(output_ptr), output_type, out_dim, compute_type, @@ -174,10 +174,10 @@ void forward_kernel(cudaStream_t stream, m.actiDesc, &alpha, m.outputTensor, - (void *)output_ptr, + reinterpret_cast(output_ptr), &beta, m.outputTensor, - (void *)output_ptr)); + reinterpret_cast(output_ptr))); } else if (m.activation == Activation::GELU) { size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) @@ -217,14 +217,14 @@ void backward_kernel(cudaStream_t stream, if (m.activation.has_value()) { if (m.activation == Activation::RELU) { relu_backward_kernel(m.output_type, - (void *)output_grad_ptr, - (void *)output_ptr, + reinterpret_cast(output_grad_ptr), + reinterpret_cast(output_ptr), output_size, stream); } else if (m.activation == Activation::SIGMOID) { sigmoid_backward_kernel(m.output_type, - (void *)output_grad_ptr, - (void *)output_ptr, + reinterpret_cast(output_grad_ptr), + reinterpret_cast(output_ptr), output_size, 
stream); } else { @@ -241,14 +241,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - (void *)input_ptr, + reinterpret_cast(input_ptr), input_type, in_dim, - (void *)output_grad_ptr, + reinterpret_cast(output_grad_ptr), output_type, out_dim, &alpha, - (void *)kernel_grad_ptr, + reinterpret_cast(kernel_grad_ptr), weight_type, in_dim, compute_type, @@ -290,14 +290,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - (void *)m.one_ptr, + reinterpret_cast(m.one_ptr), CUDA_R_32F, 1, - (void *)output_grad_ptr, + reinterpret_cast(output_grad_ptr), output_type, out_dim, &alpha, - (void *)bias_grad_ptr, + reinterpret_cast(bias_grad_ptr), weight_type, 1, compute_type, @@ -313,14 +313,14 @@ void backward_kernel(cudaStream_t stream, batch_size, out_dim, &alpha, - (void *)kernel_ptr, + reinterpret_cast(kernel_ptr), weight_type, in_dim, - (void *)output_grad_ptr, + reinterpret_cast(output_grad_ptr), output_type, out_dim, &alpha, - (void *)input_grad_ptr, + reinterpret_cast(input_grad_ptr), input_type, in_dim, compute_type, diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index ca105f9bc9..5bd49dc26f 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -3,10 +3,11 @@ namespace FlexFlow { -ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { +ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( + size_t workSpaceSize, bool allowTensorOpMathConversion) { this->handle = new PerDeviceFFHandle; - this->handle->workSpaceSize = 1024 * 1024; - this->handle->allowTensorOpMathConversion = true; + this->handle->workSpaceSize = workSpaceSize; + this->handle->allowTensorOpMathConversion = allowTensorOpMathConversion; checkCUDNN(cudnnCreate(&this->handle->dnn)); checkCUBLAS(cublasCreate(&this->handle->blas)); @@ -37,7 +38,6 @@ ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { checkCUBLAS(cublasDestroy(this->handle->blas)); checkCUDA(cudaFree(this->handle->workSpace)); delete this->handle; - this->handle = nullptr; } } diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 5245fab915..aae3676107 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -13,7 +13,7 @@ TEST_SUITE(FF_TEST_SUITE) { size_t qoSeqLength = 20, kvSeqLength = 20; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index c08e08fd08..b87f3978b5 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_TEST_SUITE) { size_t seq_length = -1; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index a8a26b8eaf..a258a27a34 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -9,7 +9,7 @@ TEST_SUITE(FF_TEST_SUITE) { size_t output_n = 1, output_c = 10, output_h = 10, output_w = 10; ManagedFFStream 
managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); Allocator allocator = create_local_cuda_memory_allocator(); @@ -37,11 +37,11 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW scale_accessor = - create_filled_accessor_w(scale_shape, allocator, 1.0f); + create_filled_accessor_w(scale_shape, allocator, DataTypeValue(1.0f)); SUBCASE("forward_kernel") { GenericTensorAccessorW bias_accessor = - create_filled_accessor_w(bias_shape, allocator, 0.0f); + create_filled_accessor_w(bias_shape, allocator, DataTypeValue(0.0f)); Kernels::BatchNorm::forward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index c5b1d98bb1..1be5839a9c 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -21,11 +21,8 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::Cast::forward_kernel(managed_stream.raw_stream(), - input_accessor, - output_accessor, - DataType::FLOAT, - DataType::DOUBLE); + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor, output_accessor); CHECK(contains_non_zero(output_accessor)); } @@ -38,9 +35,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Cast::backward_kernel(managed_stream.raw_stream(), grad_output_accessor, - grad_input_accessor, - DataType::DOUBLE, - DataType::FLOAT); + grad_input_accessor); CHECK(contains_non_zero(grad_input_accessor)); } @@ -65,11 +60,8 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW output_accessor_gpu = create_zero_filled_accessor_w(output_shape, gpu_allocator); - Kernels::Cast::forward_kernel(managed_stream.raw_stream(), - input_accessor_gpu, - output_accessor_gpu, - DataType::FLOAT, - DataType::DOUBLE); + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); // Run CPU Forward Kernel GenericTensorAccessorR input_accessor_cpu = @@ -78,12 +70,9 @@ TEST_SUITE(FF_TEST_SUITE) { create_zero_filled_accessor_w(output_shape, cpu_allocator); Kernels::Cast::cpu_forward_kernel(input_accessor_cpu, - output_accessor_cpu, - DataType::FLOAT, - DataType::DOUBLE); + output_accessor_cpu); - CHECK(w_accessors_are_equal(output_accessor_gpu, - output_accessor_cpu)); + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 89d06dff96..60179ee75b 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -6,7 +6,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Call Combine Forward and Backward Kernels") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -70,8 +70,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Combine::cpu_forward_kernel(input_accessor_cpu, output_accessor_cpu); - CHECK(w_accessors_are_equal(output_accessor_gpu, - output_accessor_cpu)); + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } SUBCASE("backward_kernel") { @@ -94,8 +93,8 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, 
input_grad_accessor_cpu); - CHECK(w_accessors_are_equal(input_grad_accessor_gpu, - input_grad_accessor_cpu)); + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index b30995cf15..841d53133c 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { size_t size_per_input = 10; ff_dim_t concat_axis = ff_dim_t(1); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; TensorShape input_shape = diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 2c7e2657f7..bee00d990d 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -18,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 3a3e3b28b7..9febf4bcc4 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -7,7 +7,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Flat Kernel") { Allocator allocator = create_local_cuda_memory_allocator(); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; TensorShape input_shape = @@ -15,8 +15,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 2.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, DataTypeValue(2.0f))); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -30,10 +30,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 0.0f); + GenericTensorAccessorW output_grad_accessor = create_filled_accessor_w( + output_shape, allocator, DataTypeValue(0.0f)); GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); + create_filled_accessor_w(input_shape, allocator, DataTypeValue(1.0f)); Kernels::Flat::backward_kernel(managed_stream.raw_stream(), input_accessor, diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index fd7a8ab47a..4f9fa02a1a 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -5,7 +5,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index b667716181..87fc88f081 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -17,7 +17,7 @@ 
TEST_SUITE(FF_TEST_SUITE) { TensorShape feature_shape = make_tensor_shape_from_legion_dims({feature_size}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -33,13 +33,13 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR input_accessor = create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW gamma_accessor = - create_filled_accessor_w(feature_shape, allocator, 1.0f); + create_filled_accessor_w(feature_shape, allocator, DataTypeValue(1.0f)); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - GenericTensorAccessorW beta_accessor = - create_filled_accessor_w(feature_shape, allocator, 0.0f); + GenericTensorAccessorW beta_accessor = create_filled_accessor_w( + feature_shape, allocator, DataTypeValue(0.0f)); Kernels::LayerNorm::forward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index 1dedb0c41d..ce8a808454 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -6,24 +6,24 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Managed FF Stream") { ManagedFFStream base_stream{}; + ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); SUBCASE("Test ManagedFFStream Move Constructor") { - ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); - ManagedFFStream new_stream(std::move(base_stream)); - CHECK(&base_stream.raw_stream() == nullptr); CHECK(&new_stream.raw_stream() == base_stream_ptr); } SUBCASE("Test ManagedFFStream Assignment Operator") { - ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); - ManagedFFStream new_stream{}; new_stream = std::move(base_stream); - CHECK(&base_stream.raw_stream() == nullptr); CHECK(&new_stream.raw_stream() == base_stream_ptr); } + + SUBCASE("Test Self-Assignment") { + base_stream = std::move(base_stream); + CHECK(&base_stream.raw_stream() == base_stream_ptr); + } } } diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index e85cfd61c7..d39da03ba9 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -5,7 +5,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Managed Per Device FF Handle") { - ManagedPerDeviceFFHandle base_handle{}; + ManagedPerDeviceFFHandle base_handle{1024 * 1024, true}; + PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); SUBCASE("Test ManagedPerDeviceFFHandle Constructor") { CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); @@ -13,8 +14,6 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Test ManagedPerDeviceFFHandle Move Constructor") { - PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); - ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); CHECK(&base_handle.raw_handle() == nullptr); @@ -22,13 +21,16 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Test ManagedPerDeviceFFHandle Assignment Operator") { - PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); - - ManagedPerDeviceFFHandle new_handle{}; + ManagedPerDeviceFFHandle new_handle{1024 * 1024, true}; new_handle = std::move(base_handle); CHECK(&base_handle.raw_handle() 
== nullptr); CHECK(&new_handle.raw_handle() == base_handle_ptr); } + + SUBCASE("Test Self-Assignment") { + base_handle = std::move(base_handle); + CHECK(&base_handle.raw_handle() == base_handle_ptr); + } } } diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 7110128885..079af64a8c 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -6,7 +6,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -20,7 +20,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - create_filled_accessor_r(input_shape, allocator, 1.0f); + create_filled_accessor_r(input_shape, allocator, DataTypeValue(1.0f)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -31,10 +31,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - create_filled_accessor_r(output_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, DataTypeValue(1.0f)); GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + create_filled_accessor_w(input_shape, allocator, DataTypeValue(2.0f)); Kernels::Repartition::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 52a177dd72..76b966ea15 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { PoolOp pool_type = PoolOp::MAX; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -56,8 +56,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorW output_grad_accessor = create_filled_accessor_w( + output_shape, allocator, DataTypeValue(1.0f)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 8706c5d877..ddbe826c70 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_tensor_shape_from_legion_dims( {10, 10, 10, 10, 10}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -35,8 +35,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { TensorShape output_shape = input_shape; - GenericTensorAccessorR output_grad_accessor = - create_filled_accessor_r(output_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, DataTypeValue(1.0f)); GenericTensorAccessorW 
input_grad_accessor = allocator.allocate_tensor(input_shape); diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 77f4001328..1d9e0677b7 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -13,7 +13,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_shape = make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -53,7 +53,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_shape = make_tensor_shape_from_legion_dims({5, num_replicas}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator gpu_allocator = create_local_cuda_memory_allocator(); @@ -78,8 +78,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, output_accessor_cpu); - CHECK(w_accessors_are_equal(output_accessor_gpu, - output_accessor_cpu)); + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } SUBCASE("backward_kernel") { @@ -103,8 +102,8 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Replicate::cpu_backward_kernel( output_grad_accessor_cpu, input_grad_accessor_cpu, num_replicas); - CHECK(w_accessors_are_equal(input_grad_accessor_gpu, - input_grad_accessor_cpu)); + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index 92a61524a3..41aaac9c3e 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -5,7 +5,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Reshape Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 4e98ea701b..436b788a99 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -14,15 +14,15 @@ TEST_SUITE(FF_TEST_SUITE) { {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, DataTypeValue(1.0f))); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -57,7 +57,7 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { - std::size_t num_out_blks = 1; + std::size_t num_out_blks = 4; std::size_t reverse_dim_size = 3; std::size_t in_blk_size = 2; @@ -65,7 +65,7 @@ TEST_SUITE(FF_TEST_SUITE) { {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; 
- ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator gpu_allocator = create_local_cuda_memory_allocator(); @@ -99,8 +99,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Reverse::cpu_forward_kernel(input_accessor_cpu, output_accessor_cpu); - CHECK(w_accessors_are_equal(output_accessor_cpu, - output_accessor_cpu)); + CHECK(accessors_are_equal(output_accessor_cpu, output_accessor_cpu)); } SUBCASE("backward_kernel") { @@ -128,8 +127,8 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Reverse::cpu_backward_kernel(output_grad_accessor_cpu, input_grad_accessor_cpu); - CHECK(w_accessors_are_equal(input_grad_accessor_gpu, - input_grad_accessor_cpu)); + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index f723a9ca46..b293d1ce75 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -8,7 +8,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Softmax Kernel Operations") { int input_n = 1, input_c = 1, input_h = 1, input_w = 100, channels = 100; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index a3cf215dff..114077d6ec 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { coord_t in_blk_size = 100; coord_t num_blks = 1; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -50,7 +50,7 @@ TEST_SUITE(FF_TEST_SUITE) { } GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 0.0f); + create_filled_accessor_w(input_shape, allocator, DataTypeValue(0.0f)); Kernels::Split::backward_kernel(managed_stream.raw_stream(), input_grad_accessor.get_float_ptr(), diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index d5d0b00576..5c5e9b31f8 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -9,7 +9,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector perm = {ff_dim_t(0), ff_dim_t(1)}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index 103c866c10..a59747b376 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -53,7 +53,7 @@ struct CreateRandomFilledAccessorW { } GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); - transfer_data_between_accessors(dst_accessor, src_accessor); + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); return dst_accessor; } @@ -62,7 +62,7 @@ struct CreateRandomFilledAccessorW { GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, Allocator &allocator) { return DataTypeDispatch1{}( - shape.data_type, shape, std::ref(allocator)); + shape.data_type, shape, 
allocator); } GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, @@ -111,20 +111,14 @@ struct CPUAccessorRContainsNonZero { bool contains_non_zero(GenericTensorAccessorR const &accessor) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR cpu_accessor = - create_cpu_compatible_accessor_r(accessor, cpu_allocator); + copy_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); return DataTypeDispatch1{}( cpu_accessor.data_type, cpu_accessor); } -bool contains_non_zero(GenericTensorAccessorW const &accessor) { - GenericTensorAccessorR r_accessor = - read_only_accessor_from_write_accessor(accessor); - return contains_non_zero(r_accessor); -} - GenericTensorAccessorR - create_cpu_compatible_accessor_r(GenericTensorAccessorR const &accessor, - Allocator &cpu_allocator) { + copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, + Allocator &cpu_allocator) { GenericTensorAccessorR cpu_accessor = accessor; if (accessor.device_type == DeviceType::GPU) { cpu_accessor = copy_tensor_accessor_r(accessor, cpu_allocator); @@ -133,8 +127,8 @@ GenericTensorAccessorR } GenericTensorAccessorW - create_cpu_compatible_accessor_w(GenericTensorAccessorW const &accessor, - Allocator &cpu_allocator) { + copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor, + Allocator &cpu_allocator) { GenericTensorAccessorW cpu_accessor = accessor; if (accessor.device_type == DeviceType::GPU) { cpu_accessor = copy_tensor_accessor_w(accessor, cpu_allocator); @@ -144,28 +138,102 @@ GenericTensorAccessorW template struct PrintCPUAccessorR { - void operator()(GenericTensorAccessorR const &accessor) { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { using T = real_type_t
<DT>;
     T const *data_ptr = accessor.get<DT>
(); for (size_t i = 0; i < accessor.shape.num_elements(); i++) { - std::cout << data_ptr[i] << " "; + stream << data_ptr[i] << " "; } - std::cout << "\n"; + stream << "\n"; } }; -void print_accessor(GenericTensorAccessorR const &accessor) { +void print_tensor_accessor_contents(GenericTensorAccessorR const &accessor, + std::ostream &stream) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR cpu_accessor = - create_cpu_compatible_accessor_r(accessor, cpu_allocator); - DataTypeDispatch1{}(accessor.data_type, accessor); + copy_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); + DataTypeDispatch1{}(accessor.data_type, accessor, stream); } -void print_accessor(GenericTensorAccessorW const &accessor) { - GenericTensorAccessorR r_accessor = - read_only_accessor_from_write_accessor(accessor); - print_accessor(r_accessor); +template +struct AccessorsAreEqual { + bool operator()(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor_a = + copy_accessor_r_to_cpu_if_necessary(accessor_a, cpu_allocator); + GenericTensorAccessorR cpu_accessor_b = + copy_accessor_r_to_cpu_if_necessary(accessor_b, cpu_allocator); + + using T = real_type_t
<DT>;
+    T const *a_data_ptr = cpu_accessor_a.get<DT>();
+    T const *b_data_ptr = cpu_accessor_b.get<DT>
(); + + for (size_t i = 0; i < accessor_a.shape.num_elements(); i++) { + if (a_data_ptr[i] != b_data_ptr[i]) { + return false; + } + } + + return true; + } +}; + +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + if (accessor_a.shape != accessor_b.shape) { + throw mk_runtime_error( + fmt::format("accessors_are_equal expected accessors to have the same " + "shape, but received: {} != {}", + accessor_a.shape, + accessor_b.shape)); + } + return DataTypeDispatch1{}( + accessor_a.data_type, accessor_a, accessor_b); +} + +template +struct CreateFilledAccessorW { + GenericTensorAccessorW operator()(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + using T = real_type_t
<DT>;
+    if (!val.template has<T>()) {
+      throw mk_runtime_error("create_filled_accessor expected data type of "
+                             "shape and passed-in value to match");
+    }
+
+    auto unwrapped_value = val.get<T>();
+    GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape);
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
+    GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape);
+
+    T *data_ptr = src_accessor.get<DT>
(); + for (size_t i = 0; i < dst_accessor.shape.num_elements(); i++) { + data_ptr[i] = unwrapped_value; + } + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + return dst_accessor; + } +}; + +GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + + return DataTypeDispatch1{}( + shape.data_type, shape, allocator, val); } +GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + GenericTensorAccessorW w_accessor = + create_filled_accessor_w(shape, allocator, val); + return read_only_accessor_from_write_accessor(w_accessor); +} } // namespace FlexFlow diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index 4de114bd48..efbbc90e08 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -8,6 +8,7 @@ #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" #include "op-attrs/datatype.h" +#include "op-attrs/datatype_value.dtg.h" namespace FlexFlow { @@ -24,103 +25,30 @@ TensorShape make_tensor_shape_from_legion_dims(LegionOrdered const &dims, DataType DT); -bool contains_non_zero(GenericTensorAccessorW const &accessor); - bool contains_non_zero(GenericTensorAccessorR const &accessor); void fill_with_zeros(GenericTensorAccessorW const &accessor); GenericTensorAccessorW - create_cpu_compatible_accessor_w(GenericTensorAccessorW const &accessor, - Allocator &allocator); + copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor, + Allocator &allocator); GenericTensorAccessorR - create_cpu_compatible_accessor_r(GenericTensorAccessorR const &accessor, - Allocator &allocator); - -void print_accessor(GenericTensorAccessorR const &accessor); - -void print_accessor(GenericTensorAccessorW const &accessor); - -template -struct CreateFilledAccessorW { - GenericTensorAccessorW operator()(TensorShape const &shape, - Allocator &allocator, - real_type_t
<DT> val) {
-    using T = real_type_t<DT>;
-    GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape);
-    Allocator cpu_allocator = create_local_cpu_memory_allocator();
-    GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape);
-    T *data_ptr = src_accessor.get<DT>
(); - for (size_t i = 0; i < dst_accessor.shape.num_elements(); i++) { - data_ptr[i] = val; - } - - transfer_data_between_accessors(dst_accessor, src_accessor); - return dst_accessor; - } -}; - -template GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, Allocator &allocator, - T val) { - return DataTypeDispatch1{}( - shape.data_type, shape, std::ref(allocator), val); -} + DataTypeValue val); -template GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, Allocator &allocator, - T val) { - GenericTensorAccessorW w_accessor = - create_filled_accessor_w(shape, allocator, val); - return read_only_accessor_from_write_accessor(w_accessor); -} - -template -bool w_accessors_are_equal(GenericTensorAccessorW const &accessor_a, - GenericTensorAccessorW const &accessor_b) { - if (accessor_a.shape.num_dims() != accessor_b.shape.num_dims()) { - throw mk_runtime_error( - "Comparing equivalence for two accessors of differing dimensions"); - } - for (size_t i = 0; i < accessor_a.shape.num_dims(); i++) { - if (accessor_a.shape[legion_dim_t(i)] != - accessor_b.shape[legion_dim_t(i)]) { - throw mk_runtime_error( - "Comparing equivalence for two accessors of differing shape"); - } - } - - if (accessor_a.data_type != accessor_b.data_type) { - return false; - } - - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorW cpu_accessor_a = - create_cpu_compatible_accessor_w(accessor_a, cpu_allocator); - GenericTensorAccessorW cpu_accessor_b = - create_cpu_compatible_accessor_w(accessor_b, cpu_allocator); - - using T = real_type_t
<DT>;
-  T *a_data_ptr = cpu_accessor_a.get<DT>();
-  T *b_data_ptr = cpu_accessor_b.get<DT>
(); - - for (size_t i = 0; i < accessor_a.shape.num_elements(); i++) { - if (a_data_ptr[i] != b_data_ptr[i]) { - print_accessor(cpu_accessor_a); - print_accessor(cpu_accessor_b); - return false; - } - } - - return true; -} - + DataTypeValue val); } // namespace FlexFlow #endif diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 3e7baf49a9..e9adf88422 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -54,9 +54,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { profiling, "[Cast] forward_time = {:.2lf}ms\n", input, - output, - input.data_type, - attrs.dtype); + output); } static std::optional @@ -73,9 +71,7 @@ static std::optional profiling, "[Cast] forward_time = {:.2lf}ms\n", input_grad, - output_grad, - input.data_type, - attrs.dtype); + output_grad); } TaskImplFunction get_cast_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 860eedaa1c..1c5d5136cd 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -125,17 +125,17 @@ static std::optional auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - auto bias = acc.get_tensor(BIAS); + auto bias = acc.get_tensor(BIAS); auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); - float const *bias_ptr = NULL; + float *bias_ptr = NULL; if (attrs.use_bias) { bias_ptr = bias.get_float_ptr(); } @@ -149,12 +149,12 @@ static std::optional "[Linear] backward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), - (float *)input_grad.get_float_ptr(), + input_grad.get_float_ptr(), output.get_float_ptr(), - (float *)output_grad.get_float_ptr(), + output_grad.get_float_ptr(), weight.get_float_ptr(), - (float *)weight_grad.get_float_ptr(), - (float *)bias_ptr, + weight_grad.get_float_ptr(), + bias_ptr, in_dim, out_dim, batch_size); diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index da3af6e3ad..788ab52a7a 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,7 +12,7 @@ // TEST_SUITE(FF_CUDA_TEST_SUITE) { // TEST_CASE("Local Cost Estimator") { // // local backing initialization -// ManagedPerDeviceFFHandle managed_handle{}; +// ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); // RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ // DeviceSpecific::create(managed_handle.raw_handle()), From 878cff100d792427b5f990e137f79bddb6750d47 Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Fri, 15 Nov 2024 17:09:37 -0800 Subject: [PATCH 18/20] code formatting and refactor --- lib/kernels/include/kernels/accessor.h | 103 +++++++++++++---- .../include/kernels/copy_tensor_accessor.h | 19 ++++ .../include/kernels/managed_ff_stream.h | 2 + .../kernels/managed_per_device_ff_handle.h | 2 + lib/kernels/src/accessor.cc | 104 +----------------- lib/kernels/src/copy_tensor_accessor.cc | 48 ++++++++ lib/kernels/src/cpu/replicate_kernels.cc | 4 +- lib/kernels/src/cpu/reverse_kernels.cc | 12 +- 
lib/kernels/src/cuda/ops/linear_kernels.cu | 42 +++---- lib/kernels/src/managed_ff_stream.cc | 9 +- .../src/managed_per_device_ff_handle.cc | 13 +-- lib/kernels/test/src/test_attention_kernel.cc | 4 +- .../test/src/test_batch_matmul_kernel.cc | 4 +- .../test/src/test_batch_norm_kernel.cc | 13 ++- lib/kernels/test/src/test_combine_kernel.cc | 4 +- lib/kernels/test/src/test_concat_kernel.cc | 6 +- lib/kernels/test/src/test_dropout.cc | 4 +- lib/kernels/test/src/test_flat_kernel.cc | 13 ++- lib/kernels/test/src/test_gather_kernels.cc | 4 +- .../test/src/test_layer_norm_kernels.cc | 11 +- .../test/src/test_managed_ff_stream.cc | 24 ++-- .../src/test_managed_per_device_ff_handle.cc | 26 +++-- lib/kernels/test/src/test_partition_kernel.cc | 15 ++- lib/kernels/test/src/test_pool_2d_kernels.cc | 7 +- lib/kernels/test/src/test_reduction_kernel.cc | 7 +- lib/kernels/test/src/test_replicate_kernel.cc | 8 +- lib/kernels/test/src/test_reshape_kernel.cc | 4 +- lib/kernels/test/src/test_reverse_kernels.cc | 11 +- lib/kernels/test/src/test_softmax_kernel.cc | 4 +- lib/kernels/test/src/test_split_kernel.cc | 9 +- lib/kernels/test/src/test_transpose_kernel.cc | 6 +- lib/kernels/test/src/test_utils.cc | 24 ++-- lib/kernels/test/src/test_utils.h | 3 +- lib/local-execution/src/ops/pool_2d.cc | 16 +-- lib/local-execution/src/ops/reverse.cc | 12 +- .../test/src/test_local_cost_estimator.cc | 6 +- .../op-attrs/dim_ordered/dim_ordered.h | 5 +- .../include/op-attrs/make_datatype_value.h | 16 +++ .../src/op-attrs/make_datatype_value.cc | 25 +++++ lib/op-attrs/src/op-attrs/ops/attention.cc | 6 +- .../src/op-attrs/parallel_tensor_shape.cc | 2 +- lib/pcg/src/pcg/computation_graph_builder.cc | 25 +++-- .../parallel_computation_graph_builder.cc | 9 +- lib/runtime/src/ops/embedding.cc | 2 +- 44 files changed, 417 insertions(+), 276 deletions(-) create mode 100644 lib/kernels/include/kernels/copy_tensor_accessor.h create mode 100644 lib/kernels/src/copy_tensor_accessor.cc create mode 100644 lib/op-attrs/include/op-attrs/make_datatype_value.h create mode 100644 lib/op-attrs/src/op-attrs/make_datatype_value.cc diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 653c8db42d..487bc1f8f0 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -11,8 +11,6 @@ namespace FlexFlow { -struct Allocator; - class GenericTensorAccessorR { public: template @@ -42,7 +40,7 @@ class GenericTensorAccessorR { bool operator!=(GenericTensorAccessorR const &) const; template - real_type_t
<DT> const &at(std::vector const &indices) const {
+  real_type_t<DT>
const &at(std::vector const &indices) const { if (this->device_type != DeviceType::CPU) { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } @@ -50,11 +48,31 @@ class GenericTensorAccessorR { throw mk_runtime_error(fmt::format( "Invalid access data type ({} != {})", this->data_type, DT)); } + if (indices.size() != this->shape.num_dims()) { + throw mk_runtime_error(fmt::format("Number of indices ({}) does not " + "match the number of dimensions ({}).", + indices.size(), + this->shape.num_dims())); + } using T = real_type_t
; - T const *data_ptr = static_cast(this->ptr); - size_t offset = calculate_index_offset(indices); + + int offset = 0; + int multiplier = 1; + for (int i = 0; i < this->shape.num_dims(); i++) { + if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + indices.at(i), + this->shape.at(legion_dim_t{i}))); + } + + offset += indices.at(i) * multiplier; + multiplier *= this->shape.at(legion_dim_t{i}); + } return data_ptr[offset]; } @@ -71,8 +89,6 @@ class GenericTensorAccessorR { decltype(ptr) const &, decltype(device_type) const &> tie() const; - - size_t calculate_index_offset(std::vector const &indices) const; }; std::string format_as(GenericTensorAccessorR const &); @@ -109,7 +125,7 @@ class GenericTensorAccessorW { operator GenericTensorAccessorR() const; template - real_type_t
<DT> &at(std::vector const &indices) {
+  real_type_t<DT>
&at(std::vector const &indices) { if (this->device_type != DeviceType::CPU) { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } @@ -117,17 +133,37 @@ class GenericTensorAccessorW { throw mk_runtime_error(fmt::format( "Invalid access data type ({} != {})", this->data_type, DT)); } + if (indices.size() != this->shape.num_dims()) { + throw mk_runtime_error(fmt::format("Number of indices ({}) does not " + "match the number of dimensions ({}).", + indices.size(), + this->shape.num_dims())); + } using T = real_type_t
; T *data_ptr = static_cast(this->ptr); - size_t offset = calculate_index_offset(indices); + int offset = 0; + int multiplier = 1; + for (int i = 0; i < this->shape.num_dims(); i++) { + if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + indices.at(i), + this->shape.at(legion_dim_t{i}))); + } + + offset += indices.at(i) * multiplier; + multiplier *= this->shape.at(legion_dim_t{i}); + } return data_ptr[offset]; } template - real_type_t
<DT> &at(std::vector const &indices) const {
+  real_type_t<DT>
&at(std::vector const &indices) const { if (this->device_type != DeviceType::CPU) { throw mk_runtime_error("Calling at() on non-CPU allocated tensor"); } @@ -135,11 +171,31 @@ class GenericTensorAccessorW { throw mk_runtime_error(fmt::format( "Invalid access data type ({} != {})", this->data_type, DT)); } + if (indices.size() != this->shape.num_dims()) { + throw mk_runtime_error(fmt::format("Number of indices ({}) does not " + "match the number of dimensions ({}).", + indices.size(), + this->shape.num_dims())); + } using T = real_type_t
; T const *data_ptr = static_cast(this->ptr); - size_t offset = calculate_index_offset(indices); + int offset = 0; + int multiplier = 1; + for (int i = 0; i < this->shape.num_dims(); i++) { + if (indices.at(i) >= this->shape.at(legion_dim_t{i})) { + throw mk_runtime_error( + fmt::format("In {} dimension, attempting to access index {} " + "when only {} indexes exist", + i, + indices.at(i), + this->shape.at(legion_dim_t{i}))); + } + + offset += indices.at(i) * multiplier; + multiplier *= this->shape.at(legion_dim_t{i}); + } return data_ptr[offset]; } @@ -156,8 +212,6 @@ class GenericTensorAccessorW { decltype(ptr) const &, decltype(device_type) const &> tie() const; - - size_t calculate_index_offset(std::vector const &indices) const; }; std::string format_as(GenericTensorAccessorW const &); @@ -213,6 +267,21 @@ std::vector std::vector get_half_ptrs(std::vector const &); +int32_t *get_int32_ptr(GenericTensorAccessorW const &); +int64_t *get_int64_ptr(GenericTensorAccessorW const &); +float *get_float_ptr(GenericTensorAccessorW const &); +double *get_double_ptr(GenericTensorAccessorW const &); +half *get_half_ptr(GenericTensorAccessorW const &); +std::vector + get_int32_ptrs(std::vector const &); +std::vector + get_int64_ptrs(std::vector const &); +std::vector + get_float_ptrs(std::vector const &); +std::vector + get_double_ptrs(std::vector const &); +std::vector get_half_ptrs(std::vector const &); + template std::vector const *> get(std::vector const &accs) { @@ -239,14 +308,6 @@ std::pair void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor, GenericTensorAccessorR const &src_accessor); -GenericTensorAccessorR - copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, - Allocator &allocator); - -GenericTensorAccessorW - copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, - Allocator &allocator); - } // namespace FlexFlow namespace FlexFlow { diff --git a/lib/kernels/include/kernels/copy_tensor_accessor.h b/lib/kernels/include/kernels/copy_tensor_accessor.h new file mode 100644 index 0000000000..da8af71e4f --- /dev/null +++ b/lib/kernels/include/kernels/copy_tensor_accessor.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H +#define _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" + +namespace FlexFlow { + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/managed_ff_stream.h b/lib/kernels/include/kernels/managed_ff_stream.h index 2f690b2eb3..26d5fb4911 100644 --- a/lib/kernels/include/kernels/managed_ff_stream.h +++ b/lib/kernels/include/kernels/managed_ff_stream.h @@ -19,6 +19,8 @@ struct ManagedFFStream { ffStream_t const &raw_stream() const; + void cleanup(); + private: ffStream_t *stream; }; diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index f9f944c6ff..035ea574de 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -24,6 +24,8 @@ struct ManagedPerDeviceFFHandle { PerDeviceFFHandle const &raw_handle() const; + void cleanup(); + private: PerDeviceFFHandle *handle; }; diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc 
index 4cb5bd83a2..e56bded737 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -26,7 +26,7 @@ void copy_accessor_data_to_l_from_r( dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); } else { assert(src_device_type == DeviceType::GPU); - assert(src_device_type == DeviceType::CPU); + assert(dst_device_type == DeviceType::GPU); checkCUDA(cudaMemcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes, @@ -53,36 +53,6 @@ std::tupledata_type, this->shape, this->ptr, this->device_type); } -size_t GenericTensorAccessorW::calculate_index_offset( - std::vector const &indices) const { - - if (indices.size() != this->shape.num_dims()) { - throw mk_runtime_error(fmt::format( - "Number of indices ({}) does not match the number of dimensions ({}).", - indices.size(), - this->shape.num_dims())); - } - - size_t offset = 0; - size_t multiplier = 1; - - for (size_t i = 0; i < this->shape.num_dims(); i++) { - if (indices[i] >= this->shape.at(legion_dim_t(i))) { - throw mk_runtime_error( - fmt::format("In {} dimension, attempting to access index {} " - "when only {} indexes exist", - i, - indices[i], - this->shape.at(legion_dim_t(i)))); - } - - offset += indices[i] * multiplier; - multiplier *= this->shape.at(legion_dim_t(i)); - } - - return offset; -} - bool GenericTensorAccessorW::operator==( GenericTensorAccessorW const &other) const { return this->tie() == other.tie(); @@ -139,36 +109,6 @@ std::tupledata_type, this->shape, this->ptr, this->device_type); } -size_t GenericTensorAccessorR::calculate_index_offset( - std::vector const &indices) const { - - if (indices.size() != this->shape.num_dims()) { - throw mk_runtime_error(fmt::format( - "Number of indices ({}) does not match the number of dimensions ({}).", - indices.size(), - this->shape.num_dims())); - } - - ssize_t offset = 0; - size_t multiplier = 1; - - for (size_t i = 0; i < this->shape.num_dims(); i++) { - if (indices[i] >= this->shape.at(legion_dim_t(i))) { - throw mk_runtime_error( - fmt::format("In {} dimension, attempting to access index {} " - "when only {} indexes exist", - i, - indices[i], - this->shape.at(legion_dim_t(i)))); - } - - offset += indices[i] * multiplier; - multiplier *= this->shape.at(legion_dim_t(i)); - } - - return offset; -} - bool GenericTensorAccessorR::operator==( GenericTensorAccessorR const &other) const { return this->tie() == other.tie(); @@ -280,46 +220,4 @@ std::pair return std::make_pair(accessor.shape, accessor.data_type); } -template -struct CopyTensorAccessorW { - GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, - Allocator &allocator) { - TensorShape shape = - get_tensor_shape(src_accessor.shape, src_accessor.data_type); - GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); - - copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); - - return dst_accessor; - } -}; - -GenericTensorAccessorW - copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, - Allocator &allocator) { - return DataTypeDispatch1{}( - src_accessor.data_type, src_accessor, allocator); -} - -template -struct CopyTensorAccessorR { - GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, - Allocator &allocator) { - TensorShape shape = - get_tensor_shape(src_accessor.shape, src_accessor.data_type); - GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); - - copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); - - return read_only_accessor_from_write_accessor(dst_accessor); - } -}; - 
-GenericTensorAccessorR - copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, - Allocator &allocator) { - return DataTypeDispatch1{}( - src_accessor.data_type, src_accessor, allocator); -} - } // namespace FlexFlow diff --git a/lib/kernels/src/copy_tensor_accessor.cc b/lib/kernels/src/copy_tensor_accessor.cc new file mode 100644 index 0000000000..6a3ad8033a --- /dev/null +++ b/lib/kernels/src/copy_tensor_accessor.cc @@ -0,0 +1,48 @@ +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow { + +template +struct CopyTensorAccessorW { + GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return dst_accessor; + } +}; + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +template +struct CopyTensorAccessorR { + GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return read_only_accessor_from_write_accessor(dst_accessor); + } +}; + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/replicate_kernels.cc b/lib/kernels/src/cpu/replicate_kernels.cc index 25693b374d..cfcb44dac5 100644 --- a/lib/kernels/src/cpu/replicate_kernels.cc +++ b/lib/kernels/src/cpu/replicate_kernels.cc @@ -19,9 +19,9 @@ struct CPUBackwardKernel { GenericTensorAccessorW &input, size_t num_replicas) { using T = real_type_t
<DT>;
-    for (size_t i = 0; i < input.shape.num_elements(); i++) {
+    for (int i = 0; i < input.shape.num_elements(); i++) {
       T cur_sum = 0;
-      for (size_t j = 0; j < num_replicas; j++) {
+      for (int j = 0; j < num_replicas; j++) {
         cur_sum += output.at<DT>({i, j});
       }
       input.at<DT>
({i}) = cur_sum; diff --git a/lib/kernels/src/cpu/reverse_kernels.cc b/lib/kernels/src/cpu/reverse_kernels.cc index e5b3719d74..bc73c80e9e 100644 --- a/lib/kernels/src/cpu/reverse_kernels.cc +++ b/lib/kernels/src/cpu/reverse_kernels.cc @@ -11,13 +11,13 @@ struct CPUReverseForwardKernel { GenericTensorAccessorW &output) { assert(input.data_type == DT && output.data_type == DT); - size_t num_out_blocks = input.shape.at(legion_dim_t(0)); - size_t reverse_dim_size = input.shape.at(legion_dim_t(1)); - size_t in_block_size = input.shape.at(legion_dim_t(2)); + int num_out_blocks = input.shape.at(legion_dim_t(0)); + int reverse_dim_size = input.shape.at(legion_dim_t(1)); + int in_block_size = input.shape.at(legion_dim_t(2)); - for (size_t block_idx = 0; block_idx < num_out_blocks; block_idx++) { - for (size_t rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) { - for (size_t i = 0; i < in_block_size; i++) { + for (int block_idx = 0; block_idx < num_out_blocks; block_idx++) { + for (int rev_idx = 0; rev_idx < reverse_dim_size; rev_idx++) { + for (int i = 0; i < in_block_size; i++) { output.at
({block_idx, rev_idx, i}) = input.at<DT>
({num_out_blocks - 1 - block_idx, reverse_dim_size - 1 - rev_idx, diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index f13ebee67e..6b069218fa 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -135,14 +135,14 @@ void forward_kernel(cudaStream_t stream, batch_size, in_dim, &alpha, - reinterpret_cast(weight_ptr), + static_cast(weight_ptr), weight_type, in_dim, - reinterpret_cast(input_ptr), + static_cast(input_ptr), input_type, in_dim, &beta, - reinterpret_cast(output_ptr), + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -156,14 +156,14 @@ void forward_kernel(cudaStream_t stream, batch_size, 1, &alpha, - reinterpret_cast(bias_ptr), + static_cast(bias_ptr), weight_type, 1, - reinterpret_cast(m.one_ptr), + static_cast(m.one_ptr), CUDA_R_32F, 1, &alpha, - reinterpret_cast(output_ptr), + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -174,10 +174,10 @@ void forward_kernel(cudaStream_t stream, m.actiDesc, &alpha, m.outputTensor, - reinterpret_cast(output_ptr), + static_cast(output_ptr), &beta, m.outputTensor, - reinterpret_cast(output_ptr))); + static_cast(output_ptr))); } else if (m.activation == Activation::GELU) { size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) @@ -217,14 +217,14 @@ void backward_kernel(cudaStream_t stream, if (m.activation.has_value()) { if (m.activation == Activation::RELU) { relu_backward_kernel(m.output_type, - reinterpret_cast(output_grad_ptr), - reinterpret_cast(output_ptr), + static_cast(output_grad_ptr), + static_cast(output_ptr), output_size, stream); } else if (m.activation == Activation::SIGMOID) { sigmoid_backward_kernel(m.output_type, - reinterpret_cast(output_grad_ptr), - reinterpret_cast(output_ptr), + static_cast(output_grad_ptr), + static_cast(output_ptr), output_size, stream); } else { @@ -241,14 +241,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - reinterpret_cast(input_ptr), + static_cast(input_ptr), input_type, in_dim, - reinterpret_cast(output_grad_ptr), + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - reinterpret_cast(kernel_grad_ptr), + static_cast(kernel_grad_ptr), weight_type, in_dim, compute_type, @@ -290,14 +290,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - reinterpret_cast(m.one_ptr), + static_cast(m.one_ptr), CUDA_R_32F, 1, - reinterpret_cast(output_grad_ptr), + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - reinterpret_cast(bias_grad_ptr), + static_cast(bias_grad_ptr), weight_type, 1, compute_type, @@ -313,14 +313,14 @@ void backward_kernel(cudaStream_t stream, batch_size, out_dim, &alpha, - reinterpret_cast(kernel_ptr), + static_cast(kernel_ptr), weight_type, in_dim, - reinterpret_cast(output_grad_ptr), + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - reinterpret_cast(input_grad_ptr), + static_cast(input_grad_ptr), input_type, in_dim, compute_type, diff --git a/lib/kernels/src/managed_ff_stream.cc b/lib/kernels/src/managed_ff_stream.cc index a8b44dc1d3..f0348aa91c 100644 --- a/lib/kernels/src/managed_ff_stream.cc +++ b/lib/kernels/src/managed_ff_stream.cc @@ -12,16 +12,17 @@ ManagedFFStream::ManagedFFStream(ManagedFFStream &&other) noexcept ManagedFFStream &ManagedFFStream::operator=(ManagedFFStream &&other) noexcept { if (this != &other) { - if (this->stream != nullptr) { - 
checkCUDA(cudaStreamDestroy(*this->stream)); - delete stream; - } + this->cleanup(); this->stream = std::exchange(other.stream, nullptr); } return *this; } ManagedFFStream::~ManagedFFStream() { + this->cleanup(); +} + +void ManagedFFStream::cleanup() { if (this->stream != nullptr) { checkCUDA(cudaStreamDestroy(*this->stream)); delete this->stream; diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index 5bd49dc26f..9f1737240e 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -5,7 +5,7 @@ namespace FlexFlow { ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( size_t workSpaceSize, bool allowTensorOpMathConversion) { - this->handle = new PerDeviceFFHandle; + this->handle = new PerDeviceFFHandle{}; this->handle->workSpaceSize = workSpaceSize; this->handle->allowTensorOpMathConversion = allowTensorOpMathConversion; @@ -21,18 +21,17 @@ ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( ManagedPerDeviceFFHandle &ManagedPerDeviceFFHandle::operator=( ManagedPerDeviceFFHandle &&other) noexcept { if (this != &other) { - if (this->handle != nullptr) { - checkCUDNN(cudnnDestroy(this->handle->dnn)); - checkCUBLAS(cublasDestroy(this->handle->blas)); - checkCUDA(cudaFree(this->handle->workSpace)); - delete this->handle; - } + this->cleanup(); this->handle = std::exchange(other.handle, nullptr); } return *this; } ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { + this->cleanup(); +} + +void ManagedPerDeviceFFHandle::cleanup() { if (this->handle != nullptr) { checkCUDNN(cudnnDestroy(this->handle->dnn)); checkCUBLAS(cublasDestroy(this->handle->blas)); diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index aae3676107..023233ecb0 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -13,7 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) { size_t qoSeqLength = 20, kvSeqLength = 20; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index b87f3978b5..8a11a069f5 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -15,7 +15,9 @@ TEST_SUITE(FF_TEST_SUITE) { size_t seq_length = -1; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index a258a27a34..611069ac93 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/batch_norm_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -9,7 +10,9 @@ TEST_SUITE(FF_TEST_SUITE) { size_t output_n = 1, output_c = 10, output_h = 10, output_w = 10; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); + 
ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -36,12 +39,12 @@ TEST_SUITE(FF_TEST_SUITE) { create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); - GenericTensorAccessorW scale_accessor = - create_filled_accessor_w(scale_shape, allocator, DataTypeValue(1.0f)); + GenericTensorAccessorW scale_accessor = create_filled_accessor_w( + scale_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { - GenericTensorAccessorW bias_accessor = - create_filled_accessor_w(bias_shape, allocator, DataTypeValue(0.0f)); + GenericTensorAccessorW bias_accessor = create_filled_accessor_w( + bias_shape, allocator, make_float_data_type_value(0)); Kernels::BatchNorm::forward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 60179ee75b..a4688a1030 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -6,7 +6,9 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Call Combine Forward and Backward Kernels") { - ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 841d53133c..b299f5dea8 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -8,9 +8,11 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { size_t num_inputs = 2; size_t size_per_input = 10; - ff_dim_t concat_axis = ff_dim_t(1); + ff_dim_t concat_axis = ff_dim_t{1}; - ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; TensorShape input_shape = diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index bee00d990d..4be2bdf7bb 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -18,7 +18,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 9febf4bcc4..b8f128b761 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/flat_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -7,7 +8,9 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Flat Kernel") { Allocator allocator = create_local_cuda_memory_allocator(); - ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + 
/*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; TensorShape input_shape = @@ -16,7 +19,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor(create_filled_accessor_w( - input_shape, allocator, DataTypeValue(2.0f))); + input_shape, allocator, make_float_data_type_value(2))); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -31,9 +34,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = create_filled_accessor_w( - output_shape, allocator, DataTypeValue(0.0f)); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, DataTypeValue(1.0f)); + output_shape, allocator, make_float_data_type_value(0)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1)); Kernels::Flat::backward_kernel(managed_stream.raw_stream(), input_accessor, diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 4f9fa02a1a..7f97563217 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -5,7 +5,9 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { - ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 87fc88f081..7d7298f83d 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/layer_norm_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -17,7 +18,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape feature_shape = make_tensor_shape_from_legion_dims({feature_size}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -32,14 +35,14 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR input_accessor = create_random_filled_accessor_r(input_shape, allocator); - GenericTensorAccessorW gamma_accessor = - create_filled_accessor_w(feature_shape, allocator, DataTypeValue(1.0f)); + GenericTensorAccessorW gamma_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); GenericTensorAccessorW beta_accessor = create_filled_accessor_w( - feature_shape, allocator, DataTypeValue(0.0f)); + feature_shape, allocator, make_float_data_type_value(0)); Kernels::LayerNorm::forward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index ce8a808454..605aa6ffa1 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -4,26 +4,28 @@ using namespace ::FlexFlow; 
TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test Managed FF Stream") { + TEST_CASE("ManagedFFStream") { ManagedFFStream base_stream{}; ffStream_t const *base_stream_ptr = &base_stream.raw_stream(); - SUBCASE("Test ManagedFFStream Move Constructor") { + SUBCASE("move constructor") { ManagedFFStream new_stream(std::move(base_stream)); CHECK(&base_stream.raw_stream() == nullptr); CHECK(&new_stream.raw_stream() == base_stream_ptr); } - SUBCASE("Test ManagedFFStream Assignment Operator") { - ManagedFFStream new_stream{}; - new_stream = std::move(base_stream); - CHECK(&base_stream.raw_stream() == nullptr); - CHECK(&new_stream.raw_stream() == base_stream_ptr); - } + SUBCASE("move assignment operator") { + SUBCASE("move assign to other") { + ManagedFFStream new_stream{}; + new_stream = std::move(base_stream); + CHECK(&base_stream.raw_stream() == nullptr); + CHECK(&new_stream.raw_stream() == base_stream_ptr); + } - SUBCASE("Test Self-Assignment") { - base_stream = std::move(base_stream); - CHECK(&base_stream.raw_stream() == base_stream_ptr); + SUBCASE("move assign to self") { + base_stream = std::move(base_stream); + CHECK(&base_stream.raw_stream() == base_stream_ptr); + } } } } diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index d39da03ba9..de3e5b72b1 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -4,33 +4,35 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test Managed Per Device FF Handle") { + TEST_CASE("ManagedPerDeviceFFHandle") { ManagedPerDeviceFFHandle base_handle{1024 * 1024, true}; PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); - SUBCASE("Test ManagedPerDeviceFFHandle Constructor") { + SUBCASE("constructor") { CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); CHECK(base_handle.raw_handle().allowTensorOpMathConversion == true); } - SUBCASE("Test ManagedPerDeviceFFHandle Move Constructor") { + SUBCASE("move constructor") { ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); CHECK(&base_handle.raw_handle() == nullptr); CHECK(&new_handle.raw_handle() == base_handle_ptr); } - SUBCASE("Test ManagedPerDeviceFFHandle Assignment Operator") { - ManagedPerDeviceFFHandle new_handle{1024 * 1024, true}; - new_handle = std::move(base_handle); + SUBCASE("move assignment operator") { + SUBCASE("move assign to other") { + ManagedPerDeviceFFHandle new_handle{1024 * 1024, true}; + new_handle = std::move(base_handle); - CHECK(&base_handle.raw_handle() == nullptr); - CHECK(&new_handle.raw_handle() == base_handle_ptr); - } + CHECK(&base_handle.raw_handle() == nullptr); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } - SUBCASE("Test Self-Assignment") { - base_handle = std::move(base_handle); - CHECK(&base_handle.raw_handle() == base_handle_ptr); + SUBCASE("move assign to self") { + base_handle = std::move(base_handle); + CHECK(&base_handle.raw_handle() == base_handle_ptr); + } } } } diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 079af64a8c..4beae62553 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -1,12 +1,15 @@ #include "doctest/doctest.h" #include "kernels/partition_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Partition Forward 
and Backward") { - ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -19,8 +22,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - create_filled_accessor_r(input_shape, allocator, DataTypeValue(1.0f)); + GenericTensorAccessorR input_accessor = create_filled_accessor_r( + input_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -32,9 +35,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( - output_shape, allocator, DataTypeValue(1.0f)); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, DataTypeValue(2.0f)); + output_shape, allocator, make_float_data_type_value(1)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2)); Kernels::Repartition::backward_kernel(managed_stream.raw_stream(), state, diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 76b966ea15..2a4d3caf9a 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/pool_2d_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -12,7 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) { PoolOp pool_type = PoolOp::MAX; - ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -57,7 +60,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = create_filled_accessor_w( - output_shape, allocator, DataTypeValue(1.0f)); + output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index ddbe826c70..3c3e828049 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/reduction_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -10,7 +11,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_tensor_shape_from_legion_dims( {10, 10, 10, 10, 10}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -36,7 +39,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_shape = input_shape; GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( - output_shape, allocator, DataTypeValue(1.0f)); + output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW 
input_grad_accessor = allocator.allocate_tensor(input_shape); diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 1d9e0677b7..27223cc7b5 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -13,7 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_shape = make_tensor_shape_from_legion_dims({100}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -53,7 +55,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_shape = make_tensor_shape_from_legion_dims({5, num_replicas}, DataType::FLOAT); - ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator gpu_allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index 41aaac9c3e..55797aeff6 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -5,7 +5,9 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Reshape Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 436b788a99..4adf79847a 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -1,6 +1,7 @@ #include "doctest/doctest.h" #include "kernels/reverse_kernels.h" #include "kernels/reverse_kernels_cpu.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" using namespace ::FlexFlow; @@ -14,7 +15,9 @@ TEST_SUITE(FF_TEST_SUITE) { {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -22,7 +25,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor(create_filled_accessor_w( - input_shape, allocator, DataTypeValue(1.0f))); + input_shape, allocator, make_float_data_type_value(1))); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -65,7 +68,9 @@ TEST_SUITE(FF_TEST_SUITE) { {num_out_blks, reverse_dim_size, in_blk_size}, DataType::FLOAT); TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator gpu_allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc 
index b293d1ce75..bb6bcb949b 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -8,7 +8,9 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Softmax Kernel Operations") { int input_n = 1, input_c = 1, input_h = 1, input_w = 100, channels = 100; - ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index 114077d6ec..34993fa151 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -1,5 +1,6 @@ #include "doctest/doctest.h" #include "kernels/split_kernels.h" +#include "op-attrs/make_datatype_value.h" #include "test_utils.h" #include "utils/containers/repeat.h" @@ -12,7 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) { coord_t in_blk_size = 100; coord_t num_blks = 1; - ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -49,8 +52,8 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, DataTypeValue(0.0f)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(0)); Kernels::Split::backward_kernel(managed_stream.raw_stream(), input_grad_accessor.get_float_ptr(), diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 5c5e9b31f8..b9ef82a764 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -7,9 +7,11 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Transpose Kernel Operations") { std::size_t num_dims = 2; - std::vector perm = {ff_dim_t(0), ff_dim_t(1)}; + std::vector perm = {ff_dim_t{0}, ff_dim_t{1}}; - ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index a59747b376..bfed1241ba 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -137,25 +137,35 @@ GenericTensorAccessorW } template -struct PrintCPUAccessorR { +struct Print2DCPUAccessorR { void operator()(GenericTensorAccessorR const &accessor, std::ostream &stream) { using T = real_type_t
<DT>; T const *data_ptr = accessor.get<DT>
(); - for (size_t i = 0; i < accessor.shape.num_elements(); i++) { - stream << data_ptr[i] << " "; + int rows = accessor.shape.at(legion_dim_t{0}); + int cols = accessor.shape.at(legion_dim_t{1}); + + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + stream << data_ptr[i * cols + j]; + + if (j < cols - 1) { + stream << " "; + } + } + stream << std::endl; } - stream << "\n"; } }; -void print_tensor_accessor_contents(GenericTensorAccessorR const &accessor, - std::ostream &stream) { +void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor, + std::ostream &stream) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR cpu_accessor = copy_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); - DataTypeDispatch1{}(accessor.data_type, accessor, stream); + DataTypeDispatch1{}( + accessor.data_type, accessor, stream); } template diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index efbbc90e08..d23b936cb0 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_TEST_UTILS #define _FLEXFLOW_KERNELS_TEST_UTILS +#include "kernels/copy_tensor_accessor.h" #include "kernels/datatype_dispatch.h" #include "kernels/device.h" #include "kernels/local_cpu_allocator.h" @@ -37,7 +38,7 @@ GenericTensorAccessorR copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor, Allocator &allocator); -void print_tensor_accessor_contents(GenericTensorAccessorR const &accessor); +void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor); bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, GenericTensorAccessorR const &accessor_b); diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index 33d62b713c..be51ea9526 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -30,14 +30,14 @@ static DeviceSpecificDeviceStates auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - int input_w = input.shape.at(ff_dim_t(0)) + 1; - int input_h = input.shape.at(ff_dim_t(1)) + 1; - int input_c = input.shape.at(ff_dim_t(2)) + 1; - int input_n = input.shape.at(ff_dim_t(3)) + 1; - int output_w = output.shape.at(ff_dim_t(0)) + 1; - int output_h = output.shape.at(ff_dim_t(1)) + 1; - int output_c = output.shape.at(ff_dim_t(2)) + 1; - int output_n = output.shape.at(ff_dim_t(3)) + 1; + int input_w = input.shape.at(ff_dim_t{0}) + 1; + int input_h = input.shape.at(ff_dim_t{1}) + 1; + int input_c = input.shape.at(ff_dim_t{2}) + 1; + int input_n = input.shape.at(ff_dim_t{3}) + 1; + int output_w = output.shape.at(ff_dim_t{0}) + 1; + int output_h = output.shape.at(ff_dim_t{1}) + 1; + int output_c = output.shape.at(ff_dim_t{2}) + 1; + int output_n = output.shape.at(ff_dim_t{3}) + 1; printf("init pool (input): n(%d) c(%d) h(%d) " "w(%d)\n", diff --git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc index 366a579bea..bb1b802edd 100644 --- a/lib/local-execution/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -53,11 +53,11 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; for (int i = 0; i < output.shape.get_dim(); i++) { if (i < axis.value) { - in_blk_size *= output.shape.at(ff_dim_t(i)); + in_blk_size *= output.shape.at(ff_dim_t{i}); } else if (i == axis.value) { - 
reverse_dim_size = output.shape.at(ff_dim_t(i)); + reverse_dim_size = output.shape.at(ff_dim_t{i}); } else { - num_out_blks *= output.shape.at(ff_dim_t(i)); + num_out_blks *= output.shape.at(ff_dim_t{i}); } } @@ -83,11 +83,11 @@ static std::optional coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; for (int i = 0; i < input_grad.shape.get_dim(); i++) { if (i < axis) { - in_blk_size *= input_grad.shape.at(ff_dim_t(i)); + in_blk_size *= input_grad.shape.at(ff_dim_t{i}); } else if (i == axis) { - reverse_dim_size = input_grad.shape.at(ff_dim_t(i)); + reverse_dim_size = input_grad.shape.at(ff_dim_t{i}); } else { - num_out_blks *= input_grad.shape.at(ff_dim_t(i)); + num_out_blks *= input_grad.shape.at(ff_dim_t{i}); } }
diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index 788ab52a7a..512c1ef33b 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,7 +12,11 @@ // TEST_SUITE(FF_CUDA_TEST_SUITE) { // TEST_CASE("Local Cost Estimator") { // // local backing initialization -// ManagedPerDeviceFFHandle managed_handle(1024 * 1024, true); +// ManagedPerDeviceFFHandle managed_handle{ +/*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true +} +; // RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ // DeviceSpecific::create(managed_handle.raw_handle()),
diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h index 6aa23d40fc..19a6e62178 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h +++ b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h @@ -175,8 +175,9 @@ auto inner_to_outer(FFOrdered<T> const &ff_ordered) template <typename T> std::vector<ff_dim_t> inner_to_outer_idxs(FFOrdered<T> const &ff_ordered) { std::vector<ff_dim_t> idxs; - for (size_t i = 0; i < ff_ordered.size(); i++) { - idxs.push_back(ff_dim_t(ff_ordered.size() - i - 1)); + int size = static_cast<int>(ff_ordered.size()); + for (int i = 0; i < size; i++) { + idxs.push_back(ff_dim_t{size - i - 1}); } return idxs; }
diff --git a/lib/op-attrs/include/op-attrs/make_datatype_value.h b/lib/op-attrs/include/op-attrs/make_datatype_value.h new file mode 100644 index 0000000000..c3289c6309 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/make_datatype_value.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H + +#include "op-attrs/datatype_value.dtg.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value); +DataTypeValue make_double_data_type_value(double value); +DataTypeValue make_int32_data_type_value(int32_t value); +DataTypeValue make_int64_data_type_value(int64_t value); +DataTypeValue make_bool_data_type_value(bool value); + +} + +#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H
diff --git a/lib/op-attrs/src/op-attrs/make_datatype_value.cc b/lib/op-attrs/src/op-attrs/make_datatype_value.cc new file mode 100644 index 0000000000..bc402c433c --- /dev/null +++ b/lib/op-attrs/src/op-attrs/make_datatype_value.cc @@ -0,0 +1,25 @@ +#include "op-attrs/make_datatype_value.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value) { + return DataTypeValue{value}; +} + +DataTypeValue make_double_data_type_value(double value) { + return DataTypeValue{value}; +} + +DataTypeValue
make_int32_data_type_value(int32_t value) { + return DataTypeValue{value}; +} + +DataTypeValue make_int64_data_type_value(int64_t value) { + return DataTypeValue{value}; +} + +DataTypeValue make_bool_data_type_value(bool value) { + return DataTypeValue{value}; +} + +} diff --git a/lib/op-attrs/src/op-attrs/ops/attention.cc b/lib/op-attrs/src/op-attrs/ops/attention.cc index 483d832fee..8a806bcf9f 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention.cc @@ -33,15 +33,15 @@ int get_oProjSize(MultiHeadAttentionAttrs const &attrs) { } int get_qSize(TensorShape const &query_shape) { - return dim_at_idx(query_shape, ff_dim_t(0)); + return dim_at_idx(query_shape, ff_dim_t{0}); } int get_kSize(TensorShape const &key_shape) { - return dim_at_idx(key_shape, ff_dim_t(0)); + return dim_at_idx(key_shape, ff_dim_t{0}); } int get_vSize(TensorShape const &value_shape) { - return dim_at_idx(value_shape, ff_dim_t(0)); + return dim_at_idx(value_shape, ff_dim_t{0}); } int get_qSize(MultiHeadAttentionParallelInputs const &inputs) { diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc index dcc567e0ca..6ea29b1855 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc @@ -138,7 +138,7 @@ std::unordered_set get_parallel_tensor_dim_indices(ParallelTensorShape const &shape) { std::unordered_set indices; extend(indices, transform(range(num_shard_dims(shape.dims)), [](int idx) { - return parallel_tensor_dim_idx_t(ff_dim_t(idx)); + return parallel_tensor_dim_idx_t(ff_dim_t{idx}); })); indices.insert(parallel_tensor_dim_idx_t(ReplicaType::SUM)); indices.insert(parallel_tensor_dim_idx_t(ReplicaType::DISCARD_COPY)); diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc index dff647f5a1..65ef214669 100644 --- a/lib/pcg/src/pcg/computation_graph_builder.cc +++ b/lib/pcg/src/pcg/computation_graph_builder.cc @@ -3,6 +3,7 @@ #include "op-attrs/get_incoming_tensor_roles.h" #include "op-attrs/get_op_type.h" #include "op-attrs/get_output_shapes.h" +#include "op-attrs/make_datatype_value.h" #include "op-attrs/ops/attention.h" #include "op-attrs/ops/batch_norm.h" #include "op-attrs/ops/broadcast.h" @@ -609,14 +610,14 @@ tensor_guid_t ComputationGraphBuilder::batch_norm( TensorShape gamma_shape = throw_if_unexpected(get_gamma_weights_shape(attrs, input_shape)); - InitializerAttrs gamma_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{1}}}}; + InitializerAttrs gamma_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(1)}}; weights.push_back(make_weight_attrs(gamma_shape, gamma_initializer)); TensorShape beta_shape = throw_if_unexpected(get_beta_weights_shape(attrs, input_shape)); - InitializerAttrs beta_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs beta_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back(make_weight_attrs(beta_shape, beta_initializer)); } @@ -688,8 +689,8 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention( get_input_bias_shape(attrs, query_shape, key_shape, value_shape)); // initializer chosen based on // https://github.com/pytorch/pytorch/blob/31c4e0d37d8efc37a0697159e5b9121ec34d5141/torch/nn/modules/activation.py#L1120-L1121 - InitializerAttrs input_bias_initializer = - 
InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs input_bias_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back( make_weight_attrs(input_bias_shape, input_bias_initializer)); @@ -698,8 +699,8 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention( get_output_bias_shape(attrs, query_shape, key_shape, value_shape)); // initializer chosen based on // https://github.com/pytorch/pytorch/blob/31c4e0d37d8efc37a0697159e5b9121ec34d5141/torch/nn/modules/activation.py#L1120-L1121 - InitializerAttrs output_bias_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs output_bias_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back( make_weight_attrs(output_bias_shape, output_bias_initializer)); @@ -870,14 +871,14 @@ tensor_guid_t ComputationGraphBuilder::layer_norm( TensorShape gamma_shape = throw_if_unexpected(get_gamma_weights_shape(attrs, input_shape)); - InitializerAttrs gamma_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{1}}}}; + InitializerAttrs gamma_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(1)}}; weights.push_back(make_weight_attrs(gamma_shape, gamma_initializer)); TensorShape beta_shape = throw_if_unexpected(get_beta_weights_shape(attrs, input_shape)); - InitializerAttrs beta_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs beta_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back(make_weight_attrs(beta_shape, beta_initializer)); } diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index f33b4dcd17..79ac43ae66 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -1,5 +1,6 @@ #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" #include "op-attrs/get_incoming_tensor_roles.h" +#include "op-attrs/make_datatype_value.h" #include "op-attrs/ops/attention.h" #include "op-attrs/ops/batch_matmul.h" #include "op-attrs/ops/batch_norm.h" @@ -385,14 +386,14 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::batch_norm( ParallelTensorShape gamma_shape = throw_if_unexpected(get_gamma_weights_shape(attrs, input_shape)); - InitializerAttrs gamma_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{1}}}}; + InitializerAttrs gamma_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(1)}}; weights.push_back(make_weight_attrs(gamma_shape, gamma_initializer)); ParallelTensorShape beta_shape = throw_if_unexpected(get_beta_weights_shape(attrs, input_shape)); - InitializerAttrs beta_initializer = - InitializerAttrs{ConstantInitializerAttrs{DataTypeValue{float{0}}}}; + InitializerAttrs beta_initializer = InitializerAttrs{ + ConstantInitializerAttrs{make_float_data_type_value(0)}}; weights.push_back(make_weight_attrs(beta_shape, beta_initializer)); } diff --git a/lib/runtime/src/ops/embedding.cc b/lib/runtime/src/ops/embedding.cc index 2370739d58..296b9f443b 100644 --- a/lib/runtime/src/ops/embedding.cc +++ b/lib/runtime/src/ops/embedding.cc @@ -85,7 +85,7 @@ static std::optional 
attrs.aggr, input.shape.get_dim(), output.shape.get_dim(), - input.shape.at(ff_dim_t(0))); + input.shape.at(ff_dim_t{0})); } TaskImplFunction get_embedding_fwd_task_impl() { From 42f1fce9e1f9b5fa1cafbd01edd329d1a6b7e38e Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Thu, 21 Nov 2024 22:16:51 -0800 Subject: [PATCH 19/20] issue #1502 & issue #1540 --- lib/kernels/CMakeLists.txt | 3 +- .../include/kernels/batch_norm_kernels.h | 4 +- lib/kernels/include/kernels/cast_kernels.h | 4 +- .../include/kernels/cast_kernels_cpu.h | 4 +- lib/kernels/include/kernels/conv_2d_kernels.h | 4 +- .../include/kernels/element_unary_kernels.h | 6 +- .../include/kernels/embedding_kernels.h | 4 +- lib/kernels/include/kernels/flat_kernels.h | 7 +- lib/kernels/include/kernels/linear_kernels.h | 4 +- .../include/kernels/loss_function_kernels.h | 2 +- lib/kernels/include/kernels/metrics_kernels.h | 29 +- .../include/kernels/optimizer_kernels.h | 124 ++-- .../include/kernels/partition_kernels.h | 4 +- .../kernels}/per_device_op_state.variant.toml | 0 lib/kernels/include/kernels/pool_2d_kernels.h | 9 +- .../include/kernels/reduction_kernels.h | 4 +- lib/kernels/include/kernels/reshape_kernels.h | 4 +- lib/kernels/include/kernels/softmax_kernels.h | 2 +- .../include/kernels/transpose_kernels.h | 4 +- lib/kernels/src/cpu/cast_kernels.cc | 14 +- lib/kernels/src/cuda/cuda_helper.cu | 12 +- lib/kernels/src/cuda/embedding_kernels.cu | 549 ++++++++++++++---- lib/kernels/src/cuda/metrics_functions.cu | 101 ++-- .../src/cuda/ops/batch_norm_kernels.cu | 4 +- lib/kernels/src/cuda/ops/cast_kernels.cu | 14 +- lib/kernels/src/cuda/ops/conv_2d_kernels.cu | 4 +- .../src/cuda/ops/element_unary_kernels.cu | 18 +- lib/kernels/src/cuda/ops/flat_kernels.cu | 4 +- lib/kernels/src/cuda/ops/linear_kernels.cu | 4 +- lib/kernels/src/cuda/ops/partition_kernels.cu | 10 +- lib/kernels/src/cuda/ops/pool_2d_kernels.cu | 6 +- lib/kernels/src/cuda/ops/reduction_kernels.cu | 10 +- lib/kernels/src/cuda/ops/reshape_kernels.cu | 10 +- lib/kernels/src/cuda/ops/softmax_kernels.cu | 2 +- lib/kernels/src/cuda/ops/transpose_kernels.cu | 4 +- ...timizer_kernel.cu => optimizer_kernels.cu} | 57 +- .../test/src/test_batch_norm_kernel.cc | 4 +- lib/kernels/test/src/test_flat_kernel.cc | 6 +- lib/kernels/test/src/test_partition_kernel.cc | 4 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 6 +- lib/kernels/test/src/test_reduction_kernel.cc | 4 +- lib/kernels/test/src/test_reshape_kernel.cc | 4 +- lib/kernels/test/src/test_softmax_kernel.cc | 2 +- lib/kernels/test/src/test_transpose_kernel.cc | 4 +- .../local-execution/per_device_op_state.h | 2 +- .../local-execution/task_argument_accessor.h | 2 +- lib/local-execution/src/ops/batch_norm.cc | 4 +- lib/local-execution/src/ops/conv_2d.cc | 6 +- lib/local-execution/src/ops/element_unary.cc | 10 +- lib/local-execution/src/ops/flat.cc | 6 +- lib/local-execution/src/ops/linear.cc | 4 +- lib/local-execution/src/ops/pool_2d.cc | 10 +- lib/local-execution/src/ops/reduction.cc | 6 +- lib/local-execution/src/ops/repartition.cc | 4 +- lib/local-execution/src/ops/reshape.cc | 4 +- lib/local-execution/src/ops/softmax.cc | 2 +- lib/local-execution/src/ops/transpose.cc | 4 +- ...device_state.cc => per_device_op_state.cc} | 0 .../include/op-attrs/aggregate_op.enum.toml | 5 +- .../include/op-attrs/datatype_value.h | 16 + .../include/op-attrs/make_datatype_value.h | 2 +- .../src/op-attrs/make_datatype_value.cc | 10 +- lib/pcg/include/pcg/metric.h | 73 +++ lib/pcg/src/pcg/metric.cc | 38 ++ lib/runtime/src/metrics_functions.cc | 33 -- 
lib/runtime/src/metrics_functions.h | 63 +- lib/runtime/src/ops/embedding.cc | 4 +- 67 files changed, 934 insertions(+), 453 deletions(-) rename lib/{local-execution/include/local-execution => kernels/include/kernels}/per_device_op_state.variant.toml (100%) rename lib/kernels/src/cuda/{optimizer_kernel.cu => optimizer_kernels.cu} (80%) rename lib/local-execution/src/{per_device_state.cc => per_device_op_state.cc} (100%) create mode 100644 lib/op-attrs/include/op-attrs/datatype_value.h create mode 100644 lib/pcg/include/pcg/metric.h create mode 100644 lib/pcg/src/pcg/metric.cc diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index fc91b7d3db..f5d88f102f 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -7,8 +7,7 @@ file(GLOB_RECURSE SRC CONFIGURE_DEPENDS LIST_DIRECTORIES False src/*.cc - src/cuda/cuda_helper.cu - src/cuda/ops/*.cu + src/cuda/*.cu ) add_library( diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index 4de6ac6af0..3fea92c86b 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -63,9 +63,9 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, BatchNormPerDeviceState const &m, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index 21e76fed1d..da13e0036d 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -11,8 +11,8 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h index 275476b4e6..a5df80d4da 100644 --- a/lib/kernels/include/kernels/cast_kernels_cpu.h +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -9,8 +9,8 @@ namespace FlexFlow::Kernels::Cast { void cpu_forward_kernel(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); -void cpu_backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index 217751e191..f49c8f50f4 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -60,10 +60,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h index 26ce4ecaec..c338f465ac 100644 --- a/lib/kernels/include/kernels/element_unary_kernels.h +++ 
b/lib/kernels/include/kernels/element_unary_kernels.h @@ -36,10 +36,10 @@ void backward_kernel(ffStream_t stream, ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad); } // namespace Kernels::ElementUnary } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/embedding_kernels.h b/lib/kernels/include/kernels/embedding_kernels.h index 6d5141f489..f5b2561b56 100644 --- a/lib/kernels/include/kernels/embedding_kernels.h +++ b/lib/kernels/include/kernels/embedding_kernels.h @@ -17,11 +17,11 @@ void forward_kernel(ffStream_t stream, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, diff --git a/lib/kernels/include/kernels/flat_kernels.h b/lib/kernels/include/kernels/flat_kernels.h index 41b411c937..d60a1a5157 100644 --- a/lib/kernels/include/kernels/flat_kernels.h +++ b/lib/kernels/include/kernels/flat_kernels.h @@ -9,10 +9,11 @@ namespace FlexFlow::Kernels::Flat { void forward_kernel(ffStream_t stream, GenericTensorAccessorR input, float *output_ptr); -void backward_kernel(ffStream_t stream, + +void backward_kernel(cudaStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr); + float const *output_grad_ptr, + float *input_grad_ptr); } // namespace FlexFlow::Kernels::Flat diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h index cff6563629..cd581b0a25 100644 --- a/lib/kernels/include/kernels/linear_kernels.h +++ b/lib/kernels/include/kernels/linear_kernels.h @@ -60,10 +60,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *kernel_ptr, float *kernel_grad_ptr, float *bias_ptr, diff --git a/lib/kernels/include/kernels/loss_function_kernels.h b/lib/kernels/include/kernels/loss_function_kernels.h index bab404f884..9e0dbd4ba1 100644 --- a/lib/kernels/include/kernels/loss_function_kernels.h +++ b/lib/kernels/include/kernels/loss_function_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H -#include "kernels/device.h" +#include "device.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/metrics_kernels.h b/lib/kernels/include/kernels/metrics_kernels.h index e4660808b9..d961ee7503 100644 --- a/lib/kernels/include/kernels/metrics_kernels.h +++ b/lib/kernels/include/kernels/metrics_kernels.h @@ -1,25 +1,24 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H -#include "perf_metrics.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric.h" namespace FlexFlow { -void update_metrics_sparse_label_kernel(ffStream_t, - MetricsAttrs const &, 
- float const *logit_ptr, - int const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); -void update_metrics_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - float const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const *me, + int num_effective_samples, + int num_classes, + PerfMetrics &perf_zc); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const *me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc); } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index 9ca6bf8e2b..3b5d292a5f 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -2,53 +2,91 @@ #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H #include "device.h" +#include "kernels/ff_handle.h" +#include "kernels/nccl.h" +#include "kernels/per_device_op_state.dtg.h" namespace FlexFlow { -void sgd_ps_update_task_gpu(ffStream_t, - float lr, - float momentum, - bool nesterov, +__global__ void sgd_update(size_t count, + float lr, + float weight_decay, + float momentum, + bool nesterov, + float const *WGrad, + float *V, + float *W); + +class SGDOptimizer { +public: + static __host__ void ps_update_task_gpu(SGDOptimizer const *op, + float const *w_grad_ptr, + size_t size, + int num_replicas, + float *w_ptr, + float *v_ptr); + +#ifdef FF_USE_NCCL + static __host__ void nccl_update_task_gpu(SGDOptimizer const *op, + PerDeviceOpState const *meta, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr); +#endif + +public: + float lr; + float weight_decay; + float momentum; + bool nesterov; +}; + +__global__ void + add_kernel(int count, float scale, float const *src, float *dst); + +__global__ void scale_kernel(int count, float a, float b, float *ptr); + +__global__ void adam_update(int count, + float alpha_t, + float beta1, + float beta2, float weight_decay, - float const *weight_grad_ptr, - size_t size, - int num_replicas, - float *weight_ptr, - float *sgd_v_ptr); - -void sgd_nccl_update_task_gpu(ffStream_t, - float lr, - float momentum, - bool nesterov, - float weight_decay PerDeviceFFHandle const &, - float const *weight_grad_ptr, - size_t size, - float *weight_ptr, - float *sgd_v_ptr); - -void adam_ps_update_task_gpu(ffStream_t, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - float const *weight_grad_ptr, - float *adam_m_ptr, - float *adam_v_ptr, - float *weight_ptr); - -void adam_nccl_update_task_gpu(ffStream_t, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - PerDeviceFFHandle const &, - float const *weight_grad_ptr, - float *adam_m_ptr, - float *adam_v_ptr, - float *weight_ptr); + float epsilon, + float const *WGrad, + float *M, + float *V, + float *W); -} // namespace FlexFlow +class AdamOptimizer { +public: + static __host__ void ps_update_task_gpu(AdamOptimizer const *op, + float const *w_grad_ptr, + size_t size, + int num_replicas, + float *w_ptr, + float *v_ptr, + float *m_ptr); +#ifdef FF_USE_NCCL + static __host__ void nccl_update_task_gpu(AdamOptimizer const *op, + PerDeviceOpState const *meta, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr, + float *m_ptr); #endif + +public: + 
float alpha; + float alpha_t; + float beta1; + float beta2; + float weight_decay; + float epsilon; +}; + +} // namespace FlexFlow + +#endif // _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H diff --git a/lib/kernels/include/kernels/partition_kernels.h b/lib/kernels/include/kernels/partition_kernels.h index e580c4a9de..9a303952d0 100644 --- a/lib/kernels/include/kernels/partition_kernels.h +++ b/lib/kernels/include/kernels/partition_kernels.h @@ -25,8 +25,8 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &output_grad, - GenericTensorAccessorR const &input_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); } // namespace Kernels::Repartition } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/per_device_op_state.variant.toml b/lib/kernels/include/kernels/per_device_op_state.variant.toml similarity index 100% rename from lib/local-execution/include/local-execution/per_device_op_state.variant.toml rename to lib/kernels/include/kernels/per_device_op_state.variant.toml diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index 191c23bc98..c0e57e2c9a 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -67,12 +67,13 @@ void forward_kernel(ffStream_t stream, void const *input_ptr, void *output_ptr); -void backward_kernel(ffStream_t stream, +void backward_kernel(cudaStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr); + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr); + } // namespace Kernels::Pool2D } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/reduction_kernels.h b/lib/kernels/include/kernels/reduction_kernels.h index 7e1e240ea4..12553edd5e 100644 --- a/lib/kernels/include/kernels/reduction_kernels.h +++ b/lib/kernels/include/kernels/reduction_kernels.h @@ -12,8 +12,8 @@ void forward_kernel(ffStream_t stream, size_t num_replicas); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); } // namespace FlexFlow::Kernels::Reduction diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index 5fa4382c43..6e19a9d251 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -24,8 +24,8 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, ReshapePerDeviceState const &per_device_state, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); } // namespace Kernels::Reshape } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/softmax_kernels.h b/lib/kernels/include/kernels/softmax_kernels.h index 93135cb648..520ea61b64 100644 --- a/lib/kernels/include/kernels/softmax_kernels.h +++ b/lib/kernels/include/kernels/softmax_kernels.h @@ -30,8 +30,8 @@ void forward_kernel(ffStream_t stream, float *output_ptr); void backward_kernel(ffStream_t stream, - float *input_grad_ptr, float const *output_grad_ptr, + float *input_grad_ptr, size_t num_elements); } // namespace Kernels::Softmax diff 
--git a/lib/kernels/include/kernels/transpose_kernels.h b/lib/kernels/include/kernels/transpose_kernels.h index b48b7e0aa8..dbf78826cb 100644 --- a/lib/kernels/include/kernels/transpose_kernels.h +++ b/lib/kernels/include/kernels/transpose_kernels.h @@ -28,8 +28,8 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, TransposePerDeviceState const &m, - GenericTensorAccessorW const &in_grad, - GenericTensorAccessorR const &out_grad); + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad); } // namespace Kernels::Transpose } // namespace FlexFlow diff --git a/lib/kernels/src/cpu/cast_kernels.cc b/lib/kernels/src/cpu/cast_kernels.cc index 5a00503fe4..08f5552afc 100644 --- a/lib/kernels/src/cpu/cast_kernels.cc +++ b/lib/kernels/src/cpu/cast_kernels.cc @@ -28,11 +28,11 @@ struct CPUForwardKernel { template struct CPUBackwardKernel { - void operator()(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume(); + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume(); cpu_cast_backward( - input.get(), output.get(), volume, cast_to(1.0f)); + output.get(), input.get(), volume, cast_to(1.0f)); } }; @@ -42,10 +42,10 @@ void cpu_forward_kernel(GenericTensorAccessorR const &input, input.data_type, output.data_type, input, output); } -void cpu_backward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { DataTypeDispatch2{}( - input.data_type, output.data_type, input, output); + output.data_type, input.data_type, output, input); } } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu index 2ff02038f4..b30cf6a663 100644 --- a/lib/kernels/src/cuda/cuda_helper.cu +++ b/lib/kernels/src/cuda/cuda_helper.cu @@ -29,13 +29,13 @@ cudaError_t get_legion_stream(cudaStream_t *stream) { #error "Unknown device, please make sure if CUDA is enabled" #endif -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +__global__ void scale_kernel(float *ptr, size_t size, float a, float b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } } -__global__ void ones_kernel(float *ptr, coord_t size) { +__global__ void ones_kernel(float *ptr, size_t size) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = 1.0f; } @@ -49,7 +49,7 @@ __global__ void assign_kernel(DT *ptr, size_t size, DT value) { } template -__global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { +__global__ void copy_kernel(DT *dst, const DT *src, size_t size) { CUDA_KERNEL_LOOP(i, size) { dst[i] = src[i]; } @@ -281,11 +281,11 @@ template __global__ void add_kernel(bool *dst, bool const *src, unsigned long size); template __global__ void - copy_kernel(float *dst, float const *src, coord_t size); + copy_kernel(float *dst, float const *src, size_t size); template __global__ void - copy_kernel(int32_t *dst, int32_t const *src, coord_t size); + copy_kernel(int32_t *dst, int32_t const *src, size_t size); template __global__ void - copy_kernel(int64_t *dst, int64_t const *src, coord_t size); + copy_kernel(int64_t *dst, int64_t const *src, size_t size); template __global__ void apply_add_with_scale(float *data_ptr, float const *grad_ptr, diff --git a/lib/kernels/src/cuda/embedding_kernels.cu 
b/lib/kernels/src/cuda/embedding_kernels.cu index e6a614ba70..c83e9f0a94 100644 --- a/lib/kernels/src/cuda/embedding_kernels.cu +++ b/lib/kernels/src/cuda/embedding_kernels.cu @@ -17,12 +17,11 @@ #include "kernels/datatype_dispatch.h" #include "kernels/embedding_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the intput tensor to avoid out of index range issues rand_generate_int<<>>( @@ -31,36 +30,14 @@ void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the intput tensor to avoid out of index range issues rand_generate_int<<>>( ptr, size, p); } -template -__global__ void embed_forward_no_aggr( - TI const *input, TD *output, TD const *embed, int out_dim, int batch_size); -template -__global__ void embed_forward_with_aggr(TI const *input, - TD *output, - TD const *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); -template -__global__ void embed_backward_no_aggr( - TI const *input, TD const *output, TD *embed, int out_dim, int batch_size); -template -__global__ void embed_backward_with_aggr(TI const *input, - TD const *output, - TD *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); - -template +template __global__ void embed_forward_no_aggr(int32_t const *input, TD *output, TD const *embed, @@ -75,7 +52,7 @@ __global__ void embed_forward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_no_aggr(int64_t const *input, TD *output, TD const *embed, @@ -90,14 +67,14 @@ __global__ void embed_forward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int32_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -115,14 +92,14 @@ __global__ void embed_forward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int64_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -140,7 +117,7 @@ __global__ void embed_forward_with_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int32_t const *input, TD const *output, TD *embed, @@ -154,7 +131,7 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int64_t const *input, TD const *output, TD *embed, @@ -171,11 +148,11 @@ __global__ void embed_backward_no_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void embed_backward_no_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ -192,11 +169,11 @@ __global__ void embed_backward_no_aggr(int32_t const 
*input, } template <> -__global__ void embed_backward_no_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ -212,14 +189,14 @@ __global__ void embed_backward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int32_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -238,14 +215,14 @@ __global__ void embed_backward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int64_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -267,14 +244,13 @@ __global__ void embed_backward_with_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void - embed_backward_with_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr) { +__global__ void embed_backward_with_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -301,14 +277,13 @@ __global__ void } template <> -__global__ void - embed_backward_with_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr) { +__global__ void embed_backward_with_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -351,35 +326,219 @@ struct ForwardKernel { int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(weight.data_type == DataType::HALF || - weight.data_type == DataType::FLOAT || - weight.data_type == DataType::DOUBLE); + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + 
embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr, real_type_t> + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + weight.get(), out_dim, - batch_size); + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - embed_forward_with_aggr, real_type_t> + embed_forward_with_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr<<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + weight.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); } } }; @@ -388,39 +547,229 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, std::optional aggr, + GenericTensorAccessorR const &output, GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const 
&weight_grad, int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(output.data_type == DataType::HALF || - output.data_type == DataType::FLOAT || - output.data_type == DataType::DOUBLE); if (!aggr.has_value()) { - embed_backward_no_aggr, real_type_t> + embed_backward_no_aggr <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, batch_size); } else { - embed_backward_with_aggr, real_type_t> + embed_backward_with_aggr <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + 
in_dim, + batch_size, + aggr.value()); } } }; @@ -448,27 +797,25 @@ void forward_kernel(ffStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, int batch_size) { - DataTypeDispatch2{}(input_data_type, - output_data_type, + DataTypeDispatch2{}(output_data_type, + input_data_type, stream, aggr, - input, output, + input, weight_grad, in_dim, out_dim, batch_size); } -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding diff --git a/lib/kernels/src/cuda/metrics_functions.cu b/lib/kernels/src/cuda/metrics_functions.cu index 2e037eb472..2901f1d374 100644 --- a/lib/kernels/src/cuda/metrics_functions.cu +++ b/lib/kernels/src/cuda/metrics_functions.cu @@ -13,17 +13,42 @@ * limitations under the License. */ -#include "flexflow/model.h" -#include "flexflow/utils/cuda_helper.h" +#include "device.h" +#include "kernels/metrics_kernels.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric.h" namespace FlexFlow { +struct CUDAPerfMetrics { + int train_all; + int train_correct; + float cce_loss; + float sparse_cce_loss; + float mse_loss; + float rmse_loss; + float mae_loss; + double start_time; + double current_time; + + CUDAPerfMetrics() = delete; + CUDAPerfMetrics(PerfMetrics const &perf) + : train_all(perf.train_all), + train_correct(perf.train_correct.value_or(-1)), + cce_loss(perf.cce_loss.value_or(-1)), + sparse_cce_loss(perf.sparse_cce_loss.value_or(-1)), + mse_loss(perf.mse_loss.value_or(-1)), + rmse_loss(perf.rmse_loss.value_or(-1)), + mae_loss(perf.mae_loss.value_or(-1)), start_time(perf.start_time), + current_time(perf.current_time) {} +}; + float const LOG_MIN_VALUE = 0.00000001f; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -72,8 +97,8 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, __global__ void update_metrics_label_kernel(float const *logits, float const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -136,17 +161,17 @@ __global__ void update_metrics_label_kernel(float const *logits, } } -void Metrics::update_metrics_sparse_label_kernel_wrapper( - float const *logit_ptr, - int const *label_ptr, - Metrics const *me, - int num_effective_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const *me, + int num_effective_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -154,32 +179,36 @@ void 
Metrics::update_metrics_sparse_label_kernel_wrapper( CUDA_NUM_THREADS, 0, stream>>>( - logit_ptr, label_ptr, perf, *me, num_effective_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, *me, num_effective_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } -void Metrics::update_metrics_label_kernel_wrapper(float const *logit_ptr, - float const *label_ptr, - Metrics const *me, - int num_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const *me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - update_metrics_label_kernel<<>>( - logit_ptr, label_ptr, perf, *me, num_samples, num_classes); + update_metrics_label_kernel<<>>( + logit_ptr, label_ptr, perf_cuda, *me, num_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } }; // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu index 6c6e17a181..512981e32b 100644 --- a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu @@ -53,9 +53,9 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, BatchNormPerDeviceState const &m, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index dc342fd0e0..afc3e1f7ef 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -50,11 +50,11 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume(); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume(); cast_backward<<>>( - input.get(), output.get(), volume, cast_to(1.0f)); + output.get(), input.get(), volume, cast_to(1.0f)); } }; @@ -66,10 +66,10 @@ void forward_kernel(ffStream_t stream, } void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { DataTypeDispatch2{}( - input.data_type, output.data_type, stream, input, output); + output.data_type, input.data_type, stream, output, 
input); } } // namespace Cast diff --git a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu index e3a4c97a31..0a4024ba8a 100644 --- a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu @@ -313,10 +313,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/element_unary_kernels.cu b/lib/kernels/src/cuda/ops/element_unary_kernels.cu index a35d28fa8c..687a9fa220 100644 --- a/lib/kernels/src/cuda/ops/element_unary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_unary_kernels.cu @@ -290,10 +290,10 @@ struct BackwardKernel { OperatorType op_type, std::optional scalar, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { checkCUDNN(cudnnSetStream(handle.dnn, stream)); if (use_cudnn(op_type)) { @@ -356,20 +356,20 @@ void backward_kernel(ffStream_t stream, ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { DataTypeDispatch1{}(input.data_type, stream, device_state, get_op_type(attrs), attrs.scalar, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } } // namespace ElementUnary diff --git a/lib/kernels/src/cuda/ops/flat_kernels.cu b/lib/kernels/src/cuda/ops/flat_kernels.cu index 941db108a0..f661e5fb0a 100644 --- a/lib/kernels/src/cuda/ops/flat_kernels.cu +++ b/lib/kernels/src/cuda/ops/flat_kernels.cu @@ -34,8 +34,8 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr) { + float const *output_grad_ptr, + float *input_grad_ptr) { float alpha = 1.0f; apply_add_with_scale diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index 6b069218fa..0d5a772918 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -191,10 +191,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *kernel_ptr, float *kernel_grad_ptr, float *bias_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu index 1d07efb5fa..3687c1cedf 100644 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ b/lib/kernels/src/cuda/ops/partition_kernels.cu @@ -39,8 +39,8 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, RepartitionPerDeviceState const &m, - 
GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { add_kernel><<{}( - m.data_type, stream, m, input_grad, output_grad); + m.data_type, stream, m, output_grad, input_grad); } } // namespace Repartition diff --git a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu index 51fa29d289..f8b35ec885 100644 --- a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu @@ -112,10 +112,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr) { + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu index 0c6ba7d8e3..9c3e8dcc40 100644 --- a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduction_kernels.cu @@ -54,8 +54,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { checkCUDA(cudaMemcpyAsync(input.get(), output.get(), input.shape.num_elements() * size_of_datatype(T), @@ -73,9 +73,9 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { - DataTypeDispatch1{}(input.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(output.data_type, stream, output, input); } } // namespace Reduction diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index 5b7843a3a5..b7a328ca08 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -42,8 +42,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { float alpha = 1.0f; apply_add_with_scale> <<{}(m.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(m.data_type, stream, output, input); } } // namespace Reshape diff --git a/lib/kernels/src/cuda/ops/softmax_kernels.cu b/lib/kernels/src/cuda/ops/softmax_kernels.cu index 93ed85de18..d2498d08a4 100644 --- a/lib/kernels/src/cuda/ops/softmax_kernels.cu +++ b/lib/kernels/src/cuda/ops/softmax_kernels.cu @@ -61,8 +61,8 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - float *input_grad_ptr, float const *output_grad_ptr, + float *input_grad_ptr, size_t num_elements) { checkCUDA(cudaMemcpyAsync(input_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/transpose_kernels.cu b/lib/kernels/src/cuda/ops/transpose_kernels.cu index 3b3f80944d..37e1a08326 100644 --- a/lib/kernels/src/cuda/ops/transpose_kernels.cu +++ b/lib/kernels/src/cuda/ops/transpose_kernels.cu @@ -91,8 +91,8 @@ void forward_kernel(cudaStream_t stream, void 
backward_kernel(cudaStream_t stream, TransposePerDeviceState const &m, - GenericTensorAccessorW const &in_grad, - GenericTensorAccessorR const &out_grad) { + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad) { TransposeStrides info; info.num_dim = in_grad.shape.num_dims(); diff --git a/lib/kernels/src/cuda/optimizer_kernel.cu b/lib/kernels/src/cuda/optimizer_kernels.cu similarity index 80% rename from lib/kernels/src/cuda/optimizer_kernel.cu rename to lib/kernels/src/cuda/optimizer_kernels.cu index 439eed9dec..237a277b21 100644 --- a/lib/kernels/src/cuda/optimizer_kernel.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -13,7 +13,9 @@ * limitations under the License. */ +#include "device.h" #include "kernels/optimizer_kernels.h" +#include "utils/exception.h" namespace FlexFlow { @@ -80,13 +82,28 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + + const auto& state = meta->raw_variant; + ncclComm_t comm = std::visit([](const auto& s) -> ncclComm_t { + using T = std::decay_t; + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + throw mk_runtime_error("State type does not support NCCL operations"); + } else { + return s.handle.ncclComm; + } + }, state); + checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); + (float *)w_grad_ptr, + size, + ncclFloat, + ncclSum, + comm, + stream)); + // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]"); @@ -157,7 +174,7 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, for (int i = 1; i < num_replicas; i++) { float const *src = w_grad_ptr + i * size; add_kernel<<>>( - size, 1.0f, src, (float *)w_grad_ptr); + (float *)w_grad_ptr, src, size); } // checkCUDA(cudaDeviceSynchronize()); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", @@ -188,13 +205,27 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, // Use NCCL to sync gradients cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + + const auto& state = meta->raw_variant; + ncclComm_t comm = std::visit([](const auto& s) -> ncclComm_t { + using T = std::decay_t; + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + throw mk_runtime_error("State type does not support NCCL operations"); + } else { + return s.handle.ncclComm; + } + }, state); + checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); + (float *)w_grad_ptr, + size, + ncclFloat, + ncclSum, + comm, + stream)); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", // op->alpha, op->alpha_t, op->weight_decay); // Step 2: Adam update diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 611069ac93..03a3a1ad40 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -68,9 +68,9 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::BatchNorm::backward_kernel(managed_stream.raw_stream(), state, - input_accessor.get_float_ptr(), - output_grad_accessor.get_float_ptr(), output_accessor.get_float_ptr(), + output_grad_accessor.get_float_ptr(), + 
input_accessor.get_float_ptr(), input_grad_accessor.get_float_ptr(), scale_accessor.get_float_ptr(), scale_grad_accessor.get_float_ptr(), diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index b8f128b761..0bb69aa1dc 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -33,15 +33,15 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = create_filled_accessor_w( + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( output_shape, allocator, make_float_data_type_value(0)); GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( input_shape, allocator, make_float_data_type_value(1)); Kernels::Flat::backward_kernel(managed_stream.raw_stream(), input_accessor, - input_grad_accessor.get_float_ptr(), - output_grad_accessor.get_float_ptr()); + output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr()); CHECK(contains_non_zero(input_grad_accessor)); } diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 4beae62553..e88c811803 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -41,8 +41,8 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Repartition::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); + output_grad_accessor, + input_grad_accessor); CHECK(contains_non_zero(input_grad_accessor)); } diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 2a4d3caf9a..00fa968235 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -66,10 +66,10 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Pool2D::backward_kernel(managed_stream.raw_stream(), state, - input_accessor.ptr, - input_grad_accessor.ptr, output_accessor.ptr, - output_grad_accessor.ptr); + output_grad_accessor.ptr, + input_accessor.ptr, + input_grad_accessor.ptr); CHECK(contains_non_zero(input_grad_accessor)); } diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 3c3e828049..1c389cb20d 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -44,8 +44,8 @@ TEST_SUITE(FF_TEST_SUITE) { allocator.allocate_tensor(input_shape); Kernels::Reduction::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor, - output_grad_accessor); + output_grad_accessor, + input_grad_accessor); CHECK(contains_non_zero(input_grad_accessor)); } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index 55797aeff6..5c04012da2 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -39,8 +39,8 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Reshape::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); + output_grad_accessor, + input_grad_accessor); CHECK(contains_non_zero(input_grad_accessor)); } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index bb6bcb949b..5519c30b80 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -45,8 +45,8 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Softmax::backward_kernel( managed_stream.raw_stream(), - 
input_grad_accessor.get_float_ptr(), output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr(), output_grad_accessor.shape.num_elements()); CHECK(contains_non_zero(input_grad_accessor)); diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index b9ef82a764..0bc85cb8e0 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -43,8 +43,8 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); + output_grad_accessor, + input_grad_accessor); CHECK(contains_non_zero(input_grad_accessor)); } diff --git a/lib/local-execution/include/local-execution/per_device_op_state.h b/lib/local-execution/include/local-execution/per_device_op_state.h index 1edd5b6360..f1f357a86e 100644 --- a/lib/local-execution/include/local-execution/per_device_op_state.h +++ b/lib/local-execution/include/local-execution/per_device_op_state.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H #define _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific_device_states.dtg.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 54c8dfc5f1..48584588e3 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific.h" #include "local-execution/itask_argument_accessor.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 851566fc02..3aed3111c7 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -133,9 +133,9 @@ static std::optional profiling, "[BatchNorm] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - output_grad.get_float_ptr(), output.get_float_ptr(), + output_grad.get_float_ptr(), + input.get_float_ptr(), input_grad.get_float_ptr(), scale.get_float_ptr(), scale_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc index d5c6e7f851..d7c5c22170 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -108,8 +108,8 @@ static std::optional acc.get_argument(PER_DEVICE_STATE); auto attrs = acc.get_argument(ATTRS); - auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); auto filter = acc.get_tensor(FILTER); auto input_grad = acc.get_tensor_grad(INPUT); @@ -121,10 +121,10 @@ static std::optional profiling, "[Conv2d] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), filter.get_float_ptr(), filter_grad.get_float_ptr(), bias_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/element_unary.cc 
b/lib/local-execution/src/ops/element_unary.cc index 4ee609bd6c..10f1dce294 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -89,10 +89,10 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor_grad(INPUT); auto const &attrs = acc.get_argument(ATTRS); auto handle = acc.get_argument(HANDLE); @@ -107,10 +107,10 @@ static std::optional per_device_state, attrs, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } TaskImplFunction get_element_unary_init_task_impl() { diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 3fe5029fa1..8d998a8672 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -41,15 +41,15 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Flat] backward_time = {:.2lf}ms\n", input, - input_grad.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_flat_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 1c5d5136cd..b567937c70 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -148,10 +148,10 @@ static std::optional profiling, "[Linear] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), weight.get_float_ptr(), weight_grad.get_float_ptr(), bias_ptr, diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index be51ea9526..2e7fb8ce91 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -125,19 +125,19 @@ static std::optional Pool2DPerDeviceState state = acc.get_argument(PER_DEVICE_STATE); - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor(INPUT); return profile(backward_kernel, profiling, "[Pool2D] backward_time = {:.2lf}ms\n", state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_pool_2d_init_task_impl() { diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index a58d79a4f8..1e85d7186e 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -64,13 +64,13 @@ static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); auto 
output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Reduction] backward_time = {:.2lf}ms\n", - input_grad, - output_grad); + output_grad, + input_grad); } TaskImplFunction get_reduction_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/ops/repartition.cc index 73692f4a13..655e1f238b 100644 --- a/lib/local-execution/src/ops/repartition.cc +++ b/lib/local-execution/src/ops/repartition.cc @@ -86,8 +86,8 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(INPUT); + auto input_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc index 7584d405eb..761718a9a7 100644 --- a/lib/local-execution/src/ops/reshape.cc +++ b/lib/local-execution/src/ops/reshape.cc @@ -87,8 +87,8 @@ static std::optional profiling, "[Reshape] backward time = {:.2lf}ms\n", per_device_state, - input_grad, - output_grad); + output_grad, + input_grad); } TaskImplFunction get_reshape_init_task_impl() { diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc index 4c7979ae9b..9c5757112c 100644 --- a/lib/local-execution/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -102,8 +102,8 @@ static std::optional return profile(backward_kernel, profiling, "[SoftMax] backward_time = {:.2lf}ms\n", - input_grad.get_float_ptr(), output_grad.get_float_ptr(), + input_grad.get_float_ptr(), output_grad.shape.get_volume()); } diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc index 3e4ac15db3..0176e6d578 100644 --- a/lib/local-execution/src/ops/transpose.cc +++ b/lib/local-execution/src/ops/transpose.cc @@ -88,8 +88,8 @@ static std::optional profiling, "[Transpose] Backward_time = {:.2lf} [ms]", per_device_state, - input_grad, - output_grad); + output_grad, + input_grad); } OpTaskInvocation backward(TransposeAttrs const &attrs) { diff --git a/lib/local-execution/src/per_device_state.cc b/lib/local-execution/src/per_device_op_state.cc similarity index 100% rename from lib/local-execution/src/per_device_state.cc rename to lib/local-execution/src/per_device_op_state.cc diff --git a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml index 27aa50f38f..2c524c120a 100644 --- a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml +++ b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml @@ -10,5 +10,8 @@ features = [ [[values]] name = "SUM" -[[value]] +[[values]] name = "AVG" + +[[values]] +name = "NONE" diff --git a/lib/op-attrs/include/op-attrs/datatype_value.h b/lib/op-attrs/include/op-attrs/datatype_value.h new file mode 100644 index 0000000000..723e69bddd --- /dev/null +++ b/lib/op-attrs/include/op-attrs/datatype_value.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H + +#include "op-attrs/datatype_value.dtg.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value); +DataTypeValue make_double_data_type_value(double value); +DataTypeValue make_int32_data_type_value(int32_t value); +DataTypeValue 
make_int64_data_type_value(int64_t value); +DataTypeValue make_bool_data_type_value(bool value); + +} // namespace FlexFlow + +#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/include/op-attrs/make_datatype_value.h b/lib/op-attrs/include/op-attrs/make_datatype_value.h index c3289c6309..af4792dd9e 100644 --- a/lib/op-attrs/include/op-attrs/make_datatype_value.h +++ b/lib/op-attrs/include/op-attrs/make_datatype_value.h @@ -11,6 +11,6 @@ DataTypeValue make_int32_data_type_value(int32_t value); DataTypeValue make_int64_data_type_value(int64_t value); DataTypeValue make_bool_data_type_value(bool value); -} +} // namespace FlexFlow #endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/src/op-attrs/make_datatype_value.cc b/lib/op-attrs/src/op-attrs/make_datatype_value.cc index bc402c433c..76d712949a 100644 --- a/lib/op-attrs/src/op-attrs/make_datatype_value.cc +++ b/lib/op-attrs/src/op-attrs/make_datatype_value.cc @@ -11,15 +11,15 @@ DataTypeValue make_double_data_type_value(double value) { } DataTypeValue make_int32_data_type_value(int32_t value) { - return DataTypeValue{value}; + return DataTypeValue{value}; } DataTypeValue make_int64_data_type_value(int64_t value) { - return DataTypeValue{value}; + return DataTypeValue{value}; } DataTypeValue make_bool_data_type_value(bool value) { - return DataTypeValue{value}; -} - + return DataTypeValue{value}; } + +} // namespace FlexFlow diff --git a/lib/pcg/include/pcg/metric.h b/lib/pcg/include/pcg/metric.h new file mode 100644 index 0000000000..f56078772e --- /dev/null +++ b/lib/pcg/include/pcg/metric.h @@ -0,0 +1,73 @@ +#ifndef _FF_METRICS_H_ +#define _FF_METRICS_H_ + +#include +#include "utils/fmt.h" +#include "op-attrs/ops/loss_functions/loss_functions.h" + +namespace FlexFlow { + +enum class Metric { + ACCURACY, + CATEGORICAL_CROSSENTROPY, + SPARSE_CATEGORICAL_CROSSENTROPY, + MEAN_SQUARED_ERROR, + ROOT_MEAN_SQUARED_ERROR, + MEAN_ABSOLUTE_ERROR, +}; + +class MetricsAttrs { +public: + MetricsAttrs() = delete; + MetricsAttrs(LossFunction, std::vector const &); + +public: + LossFunction loss_type; + bool measure_accuracy; + bool measure_categorical_crossentropy; + bool measure_sparse_categorical_crossentropy; + bool measure_mean_squared_error; + bool measure_root_mean_squared_error; + bool measure_mean_absolute_error; +}; + +} // namespace FlexFlow + +namespace fmt { + +template <> +struct formatter<::FlexFlow::Metric> : formatter { + template + auto format(::FlexFlow::Metric m, FormatContext &ctx) const + -> decltype(ctx.out()) { + using namespace FlexFlow; + + string_view name = "unknown"; + switch (m) { + case Metric::ACCURACY: + name = "Accuracy"; + break; + case Metric::CATEGORICAL_CROSSENTROPY: + name = "CategoricalCrossEntropy"; + break; + case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: + name = "SparseCategoricalCrossEntropy"; + break; + case Metric::MEAN_SQUARED_ERROR: + name = "MeanSquaredError"; + break; + case Metric::ROOT_MEAN_SQUARED_ERROR: + name = "RootMeanSquaredError"; + break; + case Metric::MEAN_ABSOLUTE_ERROR: + name = "MeanAbsoluteError"; + break; + } + return formatter::format(name, ctx); + } +}; + +} // namespace fmt + + +#endif diff --git a/lib/pcg/src/pcg/metric.cc b/lib/pcg/src/pcg/metric.cc new file mode 100644 index 0000000000..eb0d6bc5d0 --- /dev/null +++ b/lib/pcg/src/pcg/metric.cc @@ -0,0 +1,38 @@ +#include "pcg/metric.h" + +namespace FlexFlow { +MetricsAttrs::MetricsAttrs(LossFunction _loss_type, + std::vector const &metrics) + : 
loss_type(_loss_type), measure_accuracy(false), + measure_categorical_crossentropy(false), + measure_sparse_categorical_crossentropy(false), + measure_mean_squared_error(false), measure_root_mean_squared_error(false), + measure_mean_absolute_error(false) { +for (Metric const &m : metrics) { + switch (m) { + case Metric::ACCURACY: + measure_accuracy = true; + continue; + case Metric::CATEGORICAL_CROSSENTROPY: + measure_categorical_crossentropy = true; + continue; + case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: + measure_sparse_categorical_crossentropy = true; + continue; + case Metric::MEAN_SQUARED_ERROR: + measure_mean_squared_error = true; + continue; + case Metric::ROOT_MEAN_SQUARED_ERROR: + measure_root_mean_squared_error = true; + continue; + case Metric::MEAN_ABSOLUTE_ERROR: + measure_mean_absolute_error = true; + continue; + default: + throw mk_runtime_error("Initializing MetricsAttrs with unrecogonized metrics type"); + } +} +} + + +} diff --git a/lib/runtime/src/metrics_functions.cc b/lib/runtime/src/metrics_functions.cc index feb6e704b2..33e15baed2 100644 --- a/lib/runtime/src/metrics_functions.cc +++ b/lib/runtime/src/metrics_functions.cc @@ -25,39 +25,6 @@ namespace FlexFlow { LegionRuntime::Logger::Category log_metrics("metrics"); -MetricsAttrs::MetricsAttrs(LossFunction _loss_type, - std::vector const &metrics) - : loss_type(_loss_type), measure_accuracy(false), - measure_categorical_crossentropy(false), - measure_sparse_categorical_crossentropy(false), - measure_mean_squared_error(false), measure_root_mean_squared_error(false), - measure_mean_absolute_error(false) { - for (Metric const &m : metrics) { - switch (m) { - case Metric::ACCURACY: - measure_accuracy = true; - continue; - case Metric::CATEGORICAL_CROSSENTROPY: - measure_categorical_crossentropy = true; - continue; - case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: - measure_sparse_categorical_crossentropy = true; - continue; - case Metric::MEAN_SQUARED_ERROR: - measure_mean_squared_error = true; - continue; - case Metric::ROOT_MEAN_SQUARED_ERROR: - measure_root_mean_squared_error = true; - continue; - case Metric::MEAN_ABSOLUTE_ERROR: - measure_mean_absolute_error = true; - continue; - default: - throw mk_runtime_error("Unrecogonized metrics type {}", m); - } - } -} - enum Slots { LOGIT, LABEL, diff --git a/lib/runtime/src/metrics_functions.h b/lib/runtime/src/metrics_functions.h index fbb0b633bf..73dc3bbc51 100644 --- a/lib/runtime/src/metrics_functions.h +++ b/lib/runtime/src/metrics_functions.h @@ -16,38 +16,13 @@ #ifndef _FF_METRICS_FUNCTIONS_H_ #define _FF_METRICS_FUNCTIONS_H_ +#include "kernels/metric.h" #include "kernels/perf_metrics.h" #include "legion.h" -#include "op-attrs/ops/loss_functions.h" #include "task_spec/task_invocation.h" -#include "utils/fmt.h" namespace FlexFlow { -enum class Metric { - ACCURACY, - CATEGORICAL_CROSSENTROPY, - SPARSE_CATEGORICAL_CROSSENTROPY, - MEAN_SQUARED_ERROR, - ROOT_MEAN_SQUARED_ERROR, - MEAN_ABSOLUTE_ERROR, -}; - -class MetricsAttrs { -public: - MetricsAttrs() = delete; - MetricsAttrs(LossFunction, std::vector const &); - -public: - LossFunction loss_type; - bool measure_accuracy; - bool measure_categorical_crossentropy; - bool measure_sparse_categorical_crossentropy; - bool measure_mean_squared_error; - bool measure_root_mean_squared_error; - bool measure_mean_absolute_error; -}; - TypedIndexTaskInvocation compute_metrics(MetricsAttrs const &, parallel_tensor_guid_t const &logit, @@ -79,40 +54,4 @@ VISITABLE_STRUCT(::FlexFlow::MetricsAttrs, measure_root_mean_squared_error, 
measure_mean_absolute_error); -namespace fmt { - -template <> -struct formatter<::FlexFlow::Metric> : formatter { - template - auto format(::FlexFlow::Metric m, FormatContext &ctx) const - -> decltype(ctx.out()) { - using namespace FlexFlow; - - string_view name = "unknown"; - switch (m) { - case Metric::ACCURACY: - name = "Accuracy"; - break; - case Metric::CATEGORICAL_CROSSENTROPY: - name = "CategoricalCrossEntropy"; - break; - case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: - name = "SparseCategoricalCrossEntropy"; - break; - case Metric::MEAN_SQUARED_ERROR: - name = "MeanSquaredError"; - break; - case Metric::ROOT_MEAN_SQUARED_ERROR: - name = "RootMeanSquaredError"; - break; - case Metric::MEAN_ABSOLUTE_ERROR: - name = "MeanAbsoluteError"; - break; - } - return formatter::format(name, ctx); - } -}; - -} // namespace fmt - #endif diff --git a/lib/runtime/src/ops/embedding.cc b/lib/runtime/src/ops/embedding.cc index 296b9f443b..f34751ef8d 100644 --- a/lib/runtime/src/ops/embedding.cc +++ b/lib/runtime/src/ops/embedding.cc @@ -77,11 +77,11 @@ static std::optional return profile(backward_kernel, profiling, "[Embedding] backward_time = {:.2lf}ms\n", - input, output, + input, weight_grad, - input.data_type, output.data_type, + input.data_type, attrs.aggr, input.shape.get_dim(), output.shape.get_dim(), From 8f0520387bd04d436d373354f917daf3708eed1a Mon Sep 17 00:00:00 2001 From: Dylan Lim Date: Thu, 21 Nov 2024 22:46:25 -0800 Subject: [PATCH 20/20] format check --- lib/kernels/include/kernels/pool_2d_kernels.h | 1 - lib/kernels/src/cuda/metrics_functions.cu | 5 +- lib/kernels/src/cuda/optimizer_kernels.cu | 76 +++++++++---------- lib/pcg/include/pcg/metric.h | 5 +- lib/pcg/src/pcg/metric.cc | 62 +++++++-------- 5 files changed, 69 insertions(+), 80 deletions(-) diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index c0e57e2c9a..ad0a52efb9 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -74,7 +74,6 @@ void backward_kernel(cudaStream_t stream, void const *input_ptr, void *input_grad_ptr); - } // namespace Kernels::Pool2D } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/metrics_functions.cu b/lib/kernels/src/cuda/metrics_functions.cu index 2901f1d374..0250f829ec 100644 --- a/lib/kernels/src/cuda/metrics_functions.cu +++ b/lib/kernels/src/cuda/metrics_functions.cu @@ -200,10 +200,7 @@ void update_metrics_label_kernel_wrapper(float const *logit_ptr, cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - update_metrics_label_kernel<<>>( + update_metrics_label_kernel<<>>( logit_ptr, label_ptr, perf_cuda, *me, num_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); checkCUDA(cudaMemcpy( diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu index 237a277b21..1c6954a0b0 100644 --- a/lib/kernels/src/cuda/optimizer_kernels.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -83,26 +83,23 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - const auto& state = meta->raw_variant; - ncclComm_t comm = std::visit([](const auto& s) -> ncclComm_t { - using T = std::decay_t; - if constexpr (std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v) { - throw mk_runtime_error("State type does not support NCCL operations"); - } else { - return s.handle.ncclComm; - } - }, state); - - checkNCCL(ncclAllReduce(w_grad_ptr, - 
-                          (float *)w_grad_ptr,
-                          size,
-                          ncclFloat,
-                          ncclSum,
-                          comm,
-                          stream));
+  auto const &state = meta->raw_variant;
+  ncclComm_t comm = std::visit(
+      [](auto const &s) -> ncclComm_t {
+        using T = std::decay_t;
+        if constexpr (std::is_same_v ||
+                      std::is_same_v ||
+                      std::is_same_v ||
+                      std::is_same_v) {
+          throw mk_runtime_error("State type does not support NCCL operations");
+        } else {
+          return s.handle.ncclComm;
+        }
+      },
+      state);
+
+  checkNCCL(ncclAllReduce(
+      w_grad_ptr, (float *)w_grad_ptr, size, ncclFloat, ncclSum, comm, stream));
   // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr);
   // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]");
@@ -205,27 +202,24 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op,
   // Use NCCL to sync gradients
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
-
-  const auto& state = meta->raw_variant;
-  ncclComm_t comm = std::visit([](const auto& s) -> ncclComm_t {
-    using T = std::decay_t;
-    if constexpr (std::is_same_v ||
-                  std::is_same_v ||
-                  std::is_same_v ||
-                  std::is_same_v) {
-      throw mk_runtime_error("State type does not support NCCL operations");
-    } else {
-      return s.handle.ncclComm;
-    }
-  }, state);
-
-  checkNCCL(ncclAllReduce(w_grad_ptr,
-                          (float *)w_grad_ptr,
-                          size,
-                          ncclFloat,
-                          ncclSum,
-                          comm,
-                          stream));
+
+  auto const &state = meta->raw_variant;
+  ncclComm_t comm = std::visit(
+      [](auto const &s) -> ncclComm_t {
+        using T = std::decay_t;
+        if constexpr (std::is_same_v ||
+                      std::is_same_v ||
+                      std::is_same_v ||
+                      std::is_same_v) {
+          throw mk_runtime_error("State type does not support NCCL operations");
+        } else {
+          return s.handle.ncclComm;
+        }
+      },
+      state);
+
+  checkNCCL(ncclAllReduce(
+      w_grad_ptr, (float *)w_grad_ptr, size, ncclFloat, ncclSum, comm, stream));
   // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n",
   // op->alpha, op->alpha_t, op->weight_decay);
   // Step 2: Adam update
diff --git a/lib/pcg/include/pcg/metric.h b/lib/pcg/include/pcg/metric.h
index f56078772e..718919112f 100644
--- a/lib/pcg/include/pcg/metric.h
+++ b/lib/pcg/include/pcg/metric.h
@@ -1,9 +1,9 @@
 #ifndef _FF_METRICS_H_
 #define _FF_METRICS_H_
 
-#include
-#include "utils/fmt.h"
 #include "op-attrs/ops/loss_functions/loss_functions.h"
+#include "utils/fmt.h"
+#include
 
 namespace FlexFlow {
 
@@ -69,5 +69,4 @@ struct formatter<::FlexFlow::Metric> : formatter {
 
 } // namespace fmt
 
-
 #endif
diff --git a/lib/pcg/src/pcg/metric.cc b/lib/pcg/src/pcg/metric.cc
index eb0d6bc5d0..69aba90d12 100644
--- a/lib/pcg/src/pcg/metric.cc
+++ b/lib/pcg/src/pcg/metric.cc
@@ -2,37 +2,37 @@
 
 namespace FlexFlow {
 MetricsAttrs::MetricsAttrs(LossFunction _loss_type,
- std::vector const &metrics)
- : loss_type(_loss_type), measure_accuracy(false),
- measure_categorical_crossentropy(false),
- measure_sparse_categorical_crossentropy(false),
- measure_mean_squared_error(false), measure_root_mean_squared_error(false),
- measure_mean_absolute_error(false) {
-for (Metric const &m : metrics) {
- switch (m) {
- case Metric::ACCURACY:
- measure_accuracy = true;
- continue;
- case Metric::CATEGORICAL_CROSSENTROPY:
- measure_categorical_crossentropy = true;
- continue;
- case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
- measure_sparse_categorical_crossentropy = true;
- continue;
- case Metric::MEAN_SQUARED_ERROR:
- measure_mean_squared_error = true;
- continue;
- case Metric::ROOT_MEAN_SQUARED_ERROR:
- measure_root_mean_squared_error = true;
- continue;
- case Metric::MEAN_ABSOLUTE_ERROR:
- measure_mean_absolute_error = true;
- continue;
- default:
- throw mk_runtime_error("Initializing MetricsAttrs with unrecognized metrics type");
+                           std::vector const &metrics)
+    : loss_type(_loss_type), measure_accuracy(false),
+      measure_categorical_crossentropy(false),
+      measure_sparse_categorical_crossentropy(false),
+      measure_mean_squared_error(false), measure_root_mean_squared_error(false),
+      measure_mean_absolute_error(false) {
+  for (Metric const &m : metrics) {
+    switch (m) {
+      case Metric::ACCURACY:
+        measure_accuracy = true;
+        continue;
+      case Metric::CATEGORICAL_CROSSENTROPY:
+        measure_categorical_crossentropy = true;
+        continue;
+      case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
+        measure_sparse_categorical_crossentropy = true;
+        continue;
+      case Metric::MEAN_SQUARED_ERROR:
+        measure_mean_squared_error = true;
+        continue;
+      case Metric::ROOT_MEAN_SQUARED_ERROR:
+        measure_root_mean_squared_error = true;
+        continue;
+      case Metric::MEAN_ABSOLUTE_ERROR:
+        measure_mean_absolute_error = true;
+        continue;
+      default:
+        throw mk_runtime_error(
+            "Initializing MetricsAttrs with unrecognized metrics type");
+    }
   }
 }
-}
-
-}
+} // namespace FlexFlow
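
The relocated Metric/MetricsAttrs API touched by the patches above can be exercised with a small standalone snippet. This is an illustrative sketch only, not part of the patch series: the include path follows the post-patch layout (lib/pcg/include/pcg/metric.h), and the exact LossFunction enumerator name and the availability of fmt::format through utils/fmt.h are assumptions.

// Illustrative sketch only -- not part of the patch series.
// Assumes the post-patch layout (Metric and MetricsAttrs in pcg/metric.h) and
// that LossFunction exposes a CATEGORICAL_CROSSENTROPY enumerator.
#include "pcg/metric.h"

#include <iostream>
#include <vector>

int main() {
  using namespace FlexFlow;

  // Metrics requested by the user alongside the loss.
  std::vector<Metric> requested = {Metric::ACCURACY,
                                   Metric::CATEGORICAL_CROSSENTROPY};

  // The constructor turns the requested metrics into measure_* flags.
  MetricsAttrs attrs(LossFunction::CATEGORICAL_CROSSENTROPY, requested);
  std::cout << "accuracy: " << attrs.measure_accuracy << "\n";              // 1
  std::cout << "mse: " << attrs.measure_mean_squared_error << "\n";         // 0

  // The fmt::formatter<Metric> specialization kept in pcg/metric.h prints
  // enum values by name.
  std::cout << fmt::format("{}", Metric::MEAN_SQUARED_ERROR) << "\n";
  // -> "MeanSquaredError"
  return 0;
}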