Commit

ArgReduction fix
ssheorey committed Dec 31, 2024
1 parent e6f4946 commit 1c99a2b
Showing 4 changed files with 53 additions and 44 deletions.
2 changes: 1 addition & 1 deletion cpp/open3d/core/kernel/NonZeroSYCL.cpp
@@ -47,7 +47,7 @@ Tensor NonZeroSYCL(const Tensor& src) {
// MAX_DIMS: Maximum number of dimensions of TensorRef, defined in
// Indexer.h.
sycl::marray<int64_t, MAX_DIMS> shape_vec; // device copyable
- if (shape.size() <= MAX_DIMS) {
+ if (shape.size() > MAX_DIMS) {
utility::LogError("Too many dimensions: {} > MAX_DIMS={}.",
shape.size(), MAX_DIMS);
}
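Note: the fix here is an inverted guard. Before this commit, LogError fired exactly when the shape did fit (shape.size() <= MAX_DIMS), while genuinely oversized shapes fell through to the shape_vec copy; with the corrected comparison the error triggers only when the dimension count exceeds MAX_DIMS, matching the message it prints.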
72 changes: 40 additions & 32 deletions cpp/open3d/core/kernel/ReductionSYCL.cpp
@@ -24,23 +24,31 @@ namespace {
template <typename scalar_t>
struct ArgMinReduction {
using basic_reduction = sycl::minimum<scalar_t>;
- std::pair<int64_t, scalar_t> operator()(
- std::pair<int64_t, scalar_t> a,
- std::pair<int64_t, scalar_t> b) const {
- return a.second < b.second ? a : b;
+ std::pair<int64_t, scalar_t> operator()(int64_t a_idx,
+ scalar_t a_val,
+ int64_t b_idx,
+ scalar_t b_val) const {
+ return a_val < b_val ? std::make_pair(a_idx, a_val)
+ : std::make_pair(b_idx, b_val);
}
};

template <typename scalar_t>
struct ArgMaxReduction {
using basic_reduction = sycl::maximum<scalar_t>;
- std::pair<int64_t, scalar_t> operator()(
- std::pair<int64_t, scalar_t> a,
- std::pair<int64_t, scalar_t> b) const {
- return a.second > b.second ? a : b;
+ std::pair<int64_t, scalar_t> operator()(int64_t a_idx,
+ scalar_t a_val,
+ int64_t b_idx,
+ scalar_t b_val) const {
+ return a_val > b_val ? std::make_pair(a_idx, a_val)
+ : std::make_pair(b_idx, b_val);
}
};

+ // TODO: This launches one kernel per output element, which can be inefficient
+ // in cases where the reduction dim is small but the non-reduced dim is large.
+ // Unit tests for a large number of outputs are disabled.
+ // Speed-up by launching one kernel for the entire reduction.
template <class ReductionOp, typename scalar_t>
void SYCLReductionEngine(Device device, Indexer indexer, scalar_t identity) {
auto device_props =
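Note: ArgMinReduction and ArgMaxReduction now take the running (index, value) state as separate scalar arguments and return the winning pair, rather than combining two std::pair arguments. The combine logic can be checked on the host with a plain sequential fold. A minimal sketch in standalone C++ (no SYCL; the data and the +infinity identity are illustrative assumptions, not part of the commit):

#include <cstdint>
#include <iostream>
#include <limits>
#include <tuple>
#include <utility>
#include <vector>

// Same combine logic as the ArgMinReduction in this commit.
template <typename scalar_t>
struct ArgMinReduction {
    std::pair<int64_t, scalar_t> operator()(int64_t a_idx, scalar_t a_val,
                                            int64_t b_idx,
                                            scalar_t b_val) const {
        return a_val < b_val ? std::make_pair(a_idx, a_val)
                             : std::make_pair(b_idx, b_val);
    }
};

int main() {
    const std::vector<float> data{3.0f, 1.0f, 4.0f, 1.5f, -2.0f, 9.0f};
    ArgMinReduction<float> red_op;
    int64_t it_idx = 0;
    float it_val = std::numeric_limits<float>::infinity();  // identity for min
    for (int64_t i = 0; i < static_cast<int64_t>(data.size()); ++i) {
        std::tie(it_idx, it_val) = red_op(it_idx, it_val, i, data[i]);
    }
    std::cout << "argmin=" << it_idx << " min=" << it_val << "\n";
    // Prints: argmin=4 min=-2
}

The same std::tie-based accumulation is what the per-work-item loop in SYCLArgReductionEngine below performs with it_idx / it_val.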
@@ -69,14 +77,12 @@ void SYCLReductionEngine(Device device, Indexer indexer, scalar_t identity) {
auto num_work_items = num_work_groups * work_group_size;

auto red_cg = [&](auto& cgh) {
- auto output = reinterpret_cast<scalar_t*>(
- scalar_out_indexer.GetOutputPtr(0));
+ auto output = scalar_out_indexer.GetOutputPtr<scalar_t>(0);
// Setting this still doesn't initialize to identity -
// output buffer must be initialized separately.
auto sycl_reducer = sycl::reduction(
output, identity, red_op,
{sycl::property::reduction::initialize_to_identity()});
- sycl::stream out_stream(10240, 128, cgh);
cgh.parallel_for(
sycl::nd_range<1>{num_work_items, work_group_size},
sycl_reducer, [=](sycl::nd_item<1> item, auto& red_arg) {
@@ -89,11 +95,11 @@ void SYCLReductionEngine(Device device, Indexer indexer, scalar_t identity) {
size_t idx =
(i << log2workitems_per_group) + offset;
if (idx >= num_elements) break;
- auto val = *reinterpret_cast<scalar_t*>(
- scalar_out_indexer.GetInputPtr(0, idx));
+ auto val =
+ *scalar_out_indexer.GetInputPtr<scalar_t>(
+ 0, idx);
item_out = red_op(item_out, val);
}
- out_stream << glob_id << ',' << item_out << '\t';
red_arg.combine(item_out);
});
};
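Note: the offset / idx arithmetic in the loop above implements the blocked access pattern from the OneAPI GPU optimization guide sample mentioned below: each work group owns one contiguous block of elements, and within a loop iteration consecutive work items read consecutive addresses. A host-side sketch with hypothetical parameter values (the real definitions of mask and the log2 constants sit in the collapsed part of the kernel setup):

#include <cstddef>
#include <cstdio>

int main() {
    // Illustrative values only.
    const size_t log2workitems_per_group = 2;  // 4 work items per group
    const size_t elements_per_work_item = 4;
    const size_t log2elements_per_group = 4;   // 4 * 4 = 16 elements per group
    const size_t mask = (1 << log2workitems_per_group) - 1;
    const size_t num_elements = 32;

    for (size_t glob_id = 0; glob_id < 8; ++glob_id) {
        const size_t offset = ((glob_id >> log2workitems_per_group)
                               << log2elements_per_group) +
                              (glob_id & mask);
        std::printf("work item %zu reads:", glob_id);
        for (size_t i = 0; i < elements_per_work_item; i++) {
            const size_t idx = (i << log2workitems_per_group) + offset;
            if (idx >= num_elements) break;
            std::printf(" %zu", idx);
        }
        std::printf("\n");
    }
    // Items 0-3 (group 0) cover indices 0..15, items 4-7 (group 1) cover
    // 16..31; on each iteration the items of a group touch adjacent
    // addresses, which keeps global memory accesses coalesced.
}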
@@ -105,6 +111,9 @@ void SYCLReductionEngine(Device device, Indexer indexer, scalar_t identity) {

// Based on OneAPI GPU optimization guide code sample (Blocked access to
// input data + SYCL builtin reduction ops for final reduction)
+ // TODO: This launches one kernel per output element, which can be inefficient
+ // in cases where the reduction dim is small but the non-reduced dim is large.
+ // Speed-up by launching one kernel for the entire reduction.
template <class ReductionOp, typename scalar_t>
void SYCLArgReductionEngine(Device device, Indexer indexer, scalar_t identity) {
auto device_props =
@@ -146,47 +155,47 @@ void SYCLArgReductionEngine(Device device, Indexer indexer, scalar_t identity) {
auto acc_in_use =
this_output_in_use
.get_access<sycl::access_mode::read_write>(cgh);
- sycl::stream out_stream(10240, 1024, cgh);
cgh.parallel_for(
sycl::nd_range<1>{num_work_items, work_group_size},
[=](sycl::nd_item<1> item) {
+ auto& out_idx =
+ *scalar_out_indexer.GetOutputPtr<int64_t>(0, 0);
+ auto& out_val =
+ *scalar_out_indexer.GetOutputPtr<scalar_t>(1,
+ 0);
auto glob_id = item.get_global_id(0);
auto this_group = item.get_group();
auto offset = ((glob_id >> log2workitems_per_group)
<< log2elements_per_group) +
(glob_id & mask);
- std::pair<int64_t, scalar_t> item_out{0, identity};
+ int64_t it_idx = 0;
+ scalar_t it_val = identity;
for (size_t i = 0; i < elements_per_work_item; i++) {
size_t idx =
(i << log2workitems_per_group) + offset;
if (idx >= num_elements) break;
- auto val = *reinterpret_cast<scalar_t*>(
- scalar_out_indexer.GetInputPtr(0, idx));
- item_out = red_op(item_out, {idx, val});
+ auto val =
+ *scalar_out_indexer.GetInputPtr<scalar_t>(
+ 0, idx);
+ std::tie(it_idx, it_val) =
+ red_op(it_idx, it_val, idx, val);
}
- auto group_output_val = sycl::reduce_over_group(
- this_group, item_out.second, identity,
+ auto group_out_val = sycl::reduce_over_group(
+ this_group, it_val, identity,
typename ReductionOp::basic_reduction());
// atomic (serial) reduction over all groups. SYCL does
// not have a barrier over groups. Work item(s) with min
// / max value update the output. (non-deterministic)
- if (item_out.second == group_output_val) {
- out_stream << "group_output: " << group_output_val
- << item_out.first << sycl::endl;
- auto& out_idx = *reinterpret_cast<int64_t*>(
- scalar_out_indexer.GetOutputPtr(0));
- auto& out_val = *reinterpret_cast<scalar_t*>(
- scalar_out_indexer.GetOutputPtr(1));
+ if (it_val == group_out_val) {
+ // TODO: Look for a better option to a spinlock
+ // mutex.
auto in_use = sycl::atomic_ref<
int32_t, sycl::memory_order::acq_rel,
sycl::memory_scope::device>(acc_in_use[0]);
while (in_use.exchange(1) == 1) {
}
- std::tie(out_idx, out_val) =
- red_op({out_idx, out_val},
- {item_out.first, group_output_val});
+ std::tie(out_idx, out_val) = red_op(
+ out_idx, out_val, it_idx, group_out_val);
in_use.store(0);
}
});
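Note: SYCL has no barrier across work groups, so the final cross-group combine above is serialized with a spinlock built from a device-scope sycl::atomic_ref: exchange(1) spins until the flag is acquired, and store(0) releases it. A host-side analogue of the same pattern using std::atomic, as a sketch only (the argmax-style combine, types, and initial values are illustrative):

#include <atomic>
#include <cstdint>

std::atomic<int32_t> in_use{0};  // 0 = free, 1 = held
int64_t out_idx = 0;
float out_val = -1e30f;  // running global max

void combine_into_output(int64_t idx, float val) {
    // Acquire: spin until the previous holder releases the flag.
    while (in_use.exchange(1, std::memory_order_acquire) == 1) {
    }
    // Critical section: read-modify-write the shared (idx, val) pair.
    if (val > out_val) {
        out_idx = idx;
        out_val = val;
    }
    // Release: make the update visible before freeing the flag.
    in_use.store(0, std::memory_order_release);
}

On a GPU, spinning like this relies on the scheduler giving the waiting work items forward progress, which is presumably what the new TODO about finding a better option than a spinlock mutex refers to.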
@@ -273,7 +282,6 @@ void ReductionSYCL(const Tensor& src,
break;
}
});
- utility::LogInfo("dst_acc: {}", dst_acc.ToString());
} else if (s_boolean_reduce_ops.find(op_code) !=
s_boolean_reduce_ops.end()) {
if (src.GetDtype() != core::Bool) {
21 changes: 11 additions & 10 deletions cpp/tests/core/Tensor.cpp
@@ -1913,21 +1913,22 @@ TEST_P(TensorPermuteDevicesWithSYCL, ReduceSumSpecialShapes) {
EXPECT_EQ(dst.ToFlatVector<float>(), std::vector<float>({0}));
}

- TEST_P(TensorPermuteDevicesWithSYCL, ReduceMultipleOutputsSumLargeArray) {
+ TEST_P(TensorPermuteDevices, ReduceMultipleOutputsSumLargeArray) {
core::Device device = GetParam();
- core::SizeVector shape{3, 7, 8234719};
+ constexpr int64_t large = 8234719;
+ core::SizeVector shape{3, 7, large};
int64_t size = shape.NumElements();
std::vector<int> vals(size, 1);
core::Tensor src(vals, shape, core::Int32, device);
core::Tensor dst;

dst = src.Sum({}, false);
- EXPECT_EQ(dst.GetShape(), core::SizeVector({3, 7, 8234719}));
- EXPECT_EQ(dst.ToFlatVector<int>(), std::vector<int>(3 * 7 * 8234719, 1));
+ EXPECT_EQ(dst.GetShape(), core::SizeVector({3, 7, large}));
+ EXPECT_EQ(dst.ToFlatVector<int>(), std::vector<int>(3 * 7 * large, 1));

dst = src.Sum({0}, false);
- EXPECT_EQ(dst.GetShape(), core::SizeVector({7, 8234719}));
- EXPECT_EQ(dst.ToFlatVector<int>(), std::vector<int>(7 * 8234719, 3));
+ EXPECT_EQ(dst.GetShape(), core::SizeVector({7, large}));
+ EXPECT_EQ(dst.ToFlatVector<int>(), std::vector<int>(7 * large, 3));
}

TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit1D) {
@@ -1948,7 +1949,7 @@ TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit1D) {
}

// np.sum(np.ones((2, large_dim)), dim=0)
- TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase0) {
+ TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase0) {
core::Device device = GetParam();
int64_t large_dim = (1ULL << 27) + 10;
core::SizeVector shape{2, large_dim};
@@ -1972,7 +1973,7 @@ TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase0) {
}

// np.sum(np.ones((2, large_dim)), dim=1)
- TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase1) {
+ TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase1) {
core::Device device = GetParam();
int64_t large_dim = (1ULL << 27) + 10;
core::SizeVector shape{2, large_dim};
@@ -1996,7 +1997,7 @@ TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase1) {
}

// np.sum(np.ones((large_dim, 2)), dim=0)
- TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase2) {
+ TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase2) {
core::Device device = GetParam();
int64_t large_dim = (1ULL << 27) + 10;
core::SizeVector shape{large_dim, 2};
@@ -2020,7 +2021,7 @@ TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase2) {
}

// np.sum(np.ones((large_dim, 2)), dim=1)
- TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase3) {
+ TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase3) {
core::Device device = GetParam();
int64_t large_dim = (1ULL << 27) + 10;
core::SizeVector shape{large_dim, 2};
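Note: switching these fixtures from TensorPermuteDevicesWithSYCL to TensorPermuteDevices removes SYCL devices from the tests' parameter sets. That appears consistent with the TODO added in ReductionSYCL.cpp: the engine still launches one kernel per output element, so the large reduction tests (including the multi-output sum above) stay disabled for SYCL until a single-kernel reduction is implemented.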
2 changes: 1 addition & 1 deletion docker/Dockerfile.ci
@@ -88,7 +88,7 @@ RUN apt-get update && apt-get install -y \
libffi-dev \
liblzma-dev \
&& if [ "${BUILD_SYCL_MODULE}" = "ON" ]; then \
- apt-get install g++-11; \
+ apt-get install -y g++-11; \
fi \
&& rm -rf /var/lib/apt/lists/*
# OneDPL TBB backend requires libstdc++ >= v11
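Note: without -y, apt-get install prompts for confirmation, and a non-interactive Docker build aborts at that prompt; the flag makes this conditional g++-11 install behave like the apt-get install -y invocation above it.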
