Commit

ArgReduction fix
ssheorey committed Dec 31, 2024
1 parent e6f4946 commit 1c99a2b
Showing 4 changed files with 53 additions and 44 deletions.
2 changes: 1 addition & 1 deletion cpp/open3d/core/kernel/NonZeroSYCL.cpp
@@ -47,7 +47,7 @@ Tensor NonZeroSYCL(const Tensor& src) {
// MAX_DIMS: Maximum number of dimensions of TensorRef, defined in
// Indexer.h.
sycl::marray<int64_t, MAX_DIMS> shape_vec; // device copyable
- if (shape.size() <= MAX_DIMS) {
+ if (shape.size() > MAX_DIMS) {
utility::LogError("Too many dimensions: {} > MAX_DIMS={}.",
shape.size(), MAX_DIMS);
}
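Note: the fix here is an inverted guard. Before this commit, LogError fired exactly when the shape did fit (shape.size() <= MAX_DIMS), while genuinely oversized shapes fell through to the shape_vec copy; with the corrected comparison the error triggers only when the dimension count exceeds MAX_DIMS, matching the message it prints.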
72 changes: 40 additions & 32 deletions cpp/open3d/core/kernel/ReductionSYCL.cpp
@@ -24,23 +24,31 @@ namespace {
template <typename scalar_t>
struct ArgMinReduction {
using basic_reduction = sycl::minimum<scalar_t>;
- std::pair<int64_t, scalar_t> operator()(
- std::pair<int64_t, scalar_t> a,
- std::pair<int64_t, scalar_t> b) const {
- return a.second < b.second ? a : b;
+ std::pair<int64_t, scalar_t> operator()(int64_t a_idx,
+ scalar_t a_val,
+ int64_t b_idx,
+ scalar_t b_val) const {
+ return a_val < b_val ? std::make_pair(a_idx, a_val)
+ : std::make_pair(b_idx, b_val);
}
};

template <typename scalar_t>
struct ArgMaxReduction {
using basic_reduction = sycl::maximum<scalar_t>;
- std::pair<int64_t, scalar_t> operator()(
- std::pair<int64_t, scalar_t> a,
- std::pair<int64_t, scalar_t> b) const {
- return a.second > b.second ? a : b;
+ std::pair<int64_t, scalar_t> operator()(int64_t a_idx,
+ scalar_t a_val,
+ int64_t b_idx,
+ scalar_t b_val) const {
+ return a_val > b_val ? std::make_pair(a_idx, a_val)
+ : std::make_pair(b_idx, b_val);
}
};

+ // TODO: This launches one kernel per output element, which can be inefficient
+ // in cases where the reduction dim is small but the non-reduced dim is large.
+ // Unit tests for a large number of outputs are disabled.
+ // Speed-up by launching one kernel for the entire reduction.
template <class ReductionOp, typename scalar_t>
void SYCLReductionEngine(Device device, Indexer indexer, scalar_t identity) {
auto device_props =
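Note: ArgMinReduction and ArgMaxReduction now take the running (index, value) state as separate scalar arguments and return the winning pair, rather than combining two std::pair arguments. The combine logic can be checked on the host with a plain sequential fold. A minimal sketch in standalone C++ (no SYCL; the data and the +infinity identity are illustrative assumptions, not part of the commit):

#include <cstdint>
#include <iostream>
#include <limits>
#include <tuple>
#include <utility>
#include <vector>

// Same combine logic as the ArgMinReduction in this commit.
template <typename scalar_t>
struct ArgMinReduction {
    std::pair<int64_t, scalar_t> operator()(int64_t a_idx, scalar_t a_val,
                                            int64_t b_idx,
                                            scalar_t b_val) const {
        return a_val < b_val ? std::make_pair(a_idx, a_val)
                             : std::make_pair(b_idx, b_val);
    }
};

int main() {
    const std::vector<float> data{3.0f, 1.0f, 4.0f, 1.5f, -2.0f, 9.0f};
    ArgMinReduction<float> red_op;
    int64_t it_idx = 0;
    float it_val = std::numeric_limits<float>::infinity();  // identity for min
    for (int64_t i = 0; i < static_cast<int64_t>(data.size()); ++i) {
        std::tie(it_idx, it_val) = red_op(it_idx, it_val, i, data[i]);
    }
    std::cout << "argmin=" << it_idx << " min=" << it_val << "\n";
    // Prints: argmin=4 min=-2
}

The same std::tie-based accumulation is what the per-work-item loop in SYCLArgReductionEngine below performs with it_idx / it_val.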
@@ -69,14 +77,12 @@ void SYCLReductionEngine(Device device, Indexer indexer, scalar_t identity) {
auto num_work_items = num_work_groups * work_group_size;

auto red_cg = [&](auto& cgh) {
- auto output = reinterpret_cast<scalar_t*>(
- scalar_out_indexer.GetOutputPtr(0));
+ auto output = scalar_out_indexer.GetOutputPtr<scalar_t>(0);
// Setting this still doesn't initialize to identity -
// output buffer must be initialized separately.
auto sycl_reducer = sycl::reduction(
output, identity, red_op,
{sycl::property::reduction::initialize_to_identity()});
- sycl::stream out_stream(10240, 128, cgh);
cgh.parallel_for(
sycl::nd_range<1>{num_work_items, work_group_size},
sycl_reducer, [=](sycl::nd_item<1> item, auto& red_arg) {
@@ -89,11 +95,11 @@ void SYCLReductionEngine(Device device, Indexer indexer, scalar_t identity) {
size_t idx =
(i << log2workitems_per_group) + offset;
if (idx >= num_elements) break;
- auto val = *reinterpret_cast<scalar_t*>(
- scalar_out_indexer.GetInputPtr(0, idx));
+ auto val =
+ *scalar_out_indexer.GetInputPtr<scalar_t>(
+ 0, idx);
item_out = red_op(item_out, val);
}
- out_stream << glob_id << ',' << item_out << '\t';
red_arg.combine(item_out);
});
};
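Note: the offset / idx arithmetic in the loop above implements the blocked access pattern from the OneAPI GPU optimization guide sample mentioned below: each work group owns one contiguous block of elements, and within a loop iteration consecutive work items read consecutive addresses. A host-side sketch with hypothetical parameter values (the real definitions of mask and the log2 constants sit in the collapsed part of the kernel setup):

#include <cstddef>
#include <cstdio>

int main() {
    // Illustrative values only.
    const size_t log2workitems_per_group = 2;  // 4 work items per group
    const size_t elements_per_work_item = 4;
    const size_t log2elements_per_group = 4;   // 4 * 4 = 16 elements per group
    const size_t mask = (1 << log2workitems_per_group) - 1;
    const size_t num_elements = 32;

    for (size_t glob_id = 0; glob_id < 8; ++glob_id) {
        const size_t offset = ((glob_id >> log2workitems_per_group)
                               << log2elements_per_group) +
                              (glob_id & mask);
        std::printf("work item %zu reads:", glob_id);
        for (size_t i = 0; i < elements_per_work_item; i++) {
            const size_t idx = (i << log2workitems_per_group) + offset;
            if (idx >= num_elements) break;
            std::printf(" %zu", idx);
        }
        std::printf("\n");
    }
    // Items 0-3 (group 0) cover indices 0..15, items 4-7 (group 1) cover
    // 16..31; on each iteration the items of a group touch adjacent
    // addresses, which keeps global memory accesses coalesced.
}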
@@ -105,6 +111,9 @@ void SYCLReductionEngine(Device device, Indexer indexer, scalar_t identity) {

// Based on OneAPI GPU optimization guide code sample (Blocked access to
// input data + SYCL builtin reduction ops for final reduction)
+ // TODO: This launches one kernel per output element, which can be inefficient
+ // in cases where the reduction dim is small but the non-reduced dim is large.
+ // Speed-up by launching one kernel for the entire reduction.
template <class ReductionOp, typename scalar_t>
void SYCLArgReductionEngine(Device device, Indexer indexer, scalar_t identity) {
auto device_props =
@@ -146,47 +155,47 @@ void SYCLArgReductionEngine(Device device, Indexer indexer, scalar_t identity) {
auto acc_in_use =
this_output_in_use
.get_access<sycl::access_mode::read_write>(cgh);
- sycl::stream out_stream(10240, 1024, cgh);
cgh.parallel_for(
sycl::nd_range<1>{num_work_items, work_group_size},
[=](sycl::nd_item<1> item) {
+ auto& out_idx =
+ *scalar_out_indexer.GetOutputPtr<int64_t>(0, 0);
+ auto& out_val =
+ *scalar_out_indexer.GetOutputPtr<scalar_t>(1,
+ 0);
auto glob_id = item.get_global_id(0);
auto this_group = item.get_group();
auto offset = ((glob_id >> log2workitems_per_group)
<< log2elements_per_group) +
(glob_id & mask);
- std::pair<int64_t, scalar_t> item_out{0, identity};
+ int64_t it_idx = 0;
+ scalar_t it_val = identity;
for (size_t i = 0; i < elements_per_work_item; i++) {
size_t idx =
(i << log2workitems_per_group) + offset;
if (idx >= num_elements) break;
- auto val = *reinterpret_cast<scalar_t*>(
- scalar_out_indexer.GetInputPtr(0, idx));
- item_out = red_op(item_out, {idx, val});
+ auto val =
+ *scalar_out_indexer.GetInputPtr<scalar_t>(
+ 0, idx);
+ std::tie(it_idx, it_val) =
+ red_op(it_idx, it_val, idx, val);
}
- auto group_output_val = sycl::reduce_over_group(
- this_group, item_out.second, identity,
+ auto group_out_val = sycl::reduce_over_group(
+ this_group, it_val, identity,
typename ReductionOp::basic_reduction());
// atomic (serial) reduction over all groups. SYCL does
// not have a barrier over groups. Work item(s) with min
// / max value update the output. (non-deterministic)
- if (item_out.second == group_output_val) {
- out_stream << "group_output: " << group_output_val
- << item_out.first << sycl::endl;
- auto& out_idx = *reinterpret_cast<int64_t*>(
- scalar_out_indexer.GetOutputPtr(0));
- auto& out_val = *reinterpret_cast<scalar_t*>(
- scalar_out_indexer.GetOutputPtr(1));
+ if (it_val == group_out_val) {
+ // TODO: Look for a better option to a spinlock
+ // mutex.
auto in_use = sycl::atomic_ref<
int32_t, sycl::memory_order::acq_rel,
sycl::memory_scope::device>(acc_in_use[0]);
while (in_use.exchange(1) == 1) {
}
- std::tie(out_idx, out_val) =
- red_op({out_idx, out_val},
- {item_out.first, group_output_val});
+ std::tie(out_idx, out_val) = red_op(
+ out_idx, out_val, it_idx, group_out_val);
in_use.store(0);
}
});
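Note: SYCL has no barrier across work groups, so the final cross-group combine above is serialized with a spinlock built from a device-scope sycl::atomic_ref: exchange(1) spins until the flag is acquired, and store(0) releases it. A host-side analogue of the same pattern using std::atomic, as a sketch only (the argmax-style combine, types, and initial values are illustrative):

#include <atomic>
#include <cstdint>

std::atomic<int32_t> in_use{0};  // 0 = free, 1 = held
int64_t out_idx = 0;
float out_val = -1e30f;  // running global max

void combine_into_output(int64_t idx, float val) {
    // Acquire: spin until the previous holder releases the flag.
    while (in_use.exchange(1, std::memory_order_acquire) == 1) {
    }
    // Critical section: read-modify-write the shared (idx, val) pair.
    if (val > out_val) {
        out_idx = idx;
        out_val = val;
    }
    // Release: make the update visible before freeing the flag.
    in_use.store(0, std::memory_order_release);
}

On a GPU, spinning like this relies on the scheduler giving the waiting work items forward progress, which is presumably what the new TODO about finding a better option than a spinlock mutex refers to.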
@@ -273,7 +282,6 @@ void ReductionSYCL(const Tensor& src,
break;
}
});
- utility::LogInfo("dst_acc: {}", dst_acc.ToString());
} else if (s_boolean_reduce_ops.find(op_code) !=
s_boolean_reduce_ops.end()) {
if (src.GetDtype() != core::Bool) {
21 changes: 11 additions & 10 deletions cpp/tests/core/Tensor.cpp
@@ -1913,21 +1913,22 @@ TEST_P(TensorPermuteDevicesWithSYCL, ReduceSumSpecialShapes) {
EXPECT_EQ(dst.ToFlatVector<float>(), std::vector<float>({0}));
}

- TEST_P(TensorPermuteDevicesWithSYCL, ReduceMultipleOutputsSumLargeArray) {
+ TEST_P(TensorPermuteDevices, ReduceMultipleOutputsSumLargeArray) {
core::Device device = GetParam();
- core::SizeVector shape{3, 7, 8234719};
+ constexpr int64_t large = 8234719;
+ core::SizeVector shape{3, 7, large};
int64_t size = shape.NumElements();
std::vector<int> vals(size, 1);
core::Tensor src(vals, shape, core::Int32, device);
core::Tensor dst;

dst = src.Sum({}, false);
- EXPECT_EQ(dst.GetShape(), core::SizeVector({3, 7, 8234719}));
- EXPECT_EQ(dst.ToFlatVector<int>(), std::vector<int>(3 * 7 * 8234719, 1));
+ EXPECT_EQ(dst.GetShape(), core::SizeVector({3, 7, large}));
+ EXPECT_EQ(dst.ToFlatVector<int>(), std::vector<int>(3 * 7 * large, 1));

dst = src.Sum({0}, false);
- EXPECT_EQ(dst.GetShape(), core::SizeVector({7, 8234719}));
- EXPECT_EQ(dst.ToFlatVector<int>(), std::vector<int>(7 * 8234719, 3));
+ EXPECT_EQ(dst.GetShape(), core::SizeVector({7, large}));
+ EXPECT_EQ(dst.ToFlatVector<int>(), std::vector<int>(7 * large, 3));
}

TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit1D) {
@@ -1948,7 +1949,7 @@ TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit1D) {
}

// np.sum(np.ones((2, large_dim)), dim=0)
- TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase0) {
+ TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase0) {
core::Device device = GetParam();
int64_t large_dim = (1ULL << 27) + 10;
core::SizeVector shape{2, large_dim};
@@ -1972,7 +1973,7 @@ TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase0) {
}

// np.sum(np.ones((2, large_dim)), dim=1)
- TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase1) {
+ TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase1) {
core::Device device = GetParam();
int64_t large_dim = (1ULL << 27) + 10;
core::SizeVector shape{2, large_dim};
@@ -1996,7 +1997,7 @@ TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase1) {
}

// np.sum(np.ones((large_dim, 2)), dim=0)
- TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase2) {
+ TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase2) {
core::Device device = GetParam();
int64_t large_dim = (1ULL << 27) + 10;
core::SizeVector shape{large_dim, 2};
@@ -2020,7 +2021,7 @@ TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase2) {
}

// np.sum(np.ones((large_dim, 2)), dim=1)
- TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit2DCase3) {
+ TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase3) {
core::Device device = GetParam();
int64_t large_dim = (1ULL << 27) + 10;
core::SizeVector shape{large_dim, 2};
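Note: switching these fixtures from TensorPermuteDevicesWithSYCL to TensorPermuteDevices removes SYCL devices from the tests' parameter sets. That appears consistent with the TODO added in ReductionSYCL.cpp: the engine still launches one kernel per output element, so the large reduction tests (including the multi-output sum above) stay disabled for SYCL until a single-kernel reduction is implemented.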
2 changes: 1 addition & 1 deletion docker/Dockerfile.ci
@@ -88,7 +88,7 @@ RUN apt-get update && apt-get install -y \
libffi-dev \
liblzma-dev \
&& if [ "${BUILD_SYCL_MODULE}" = "ON" ]; then \
- apt-get install g++-11; \
+ apt-get install -y g++-11; \
fi \
&& rm -rf /var/lib/apt/lists/*
# OneDPL TBB backend requires libstdc++ >= v11
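Note: without -y, apt-get install prompts for confirmation, and a non-interactive Docker build aborts at that prompt; the flag makes this conditional g++-11 install behave like the apt-get install -y invocation above it.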
