From 641c8683ec9dcd9d69d82d7018713b67b5d97a8f Mon Sep 17 00:00:00 2001 From: amukkara <134339030+amukkara@users.noreply.github.com> Date: Fri, 8 Sep 2023 16:48:13 -0700 Subject: [PATCH] Add `dynamic_bitset` (#352) This PR adds `dynamic_bitset` code that will be used in Trie data structure. Trie will be integrated in a separate PR #350. Since `dynamic_bitset` is not intended to be part of public-facing API, all files (.cuh and .inl) are located in include/cuco/detail/trie/dynamic_bitset Tests are added in tests/dynamic_bitset --------- Co-authored-by: Yunsong Wang Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../trie/dynamic_bitset/dynamic_bitset.cuh | 375 ++++++++++++++++ .../trie/dynamic_bitset/dynamic_bitset.inl | 404 ++++++++++++++++++ .../detail/trie/dynamic_bitset/kernels.cuh | 240 +++++++++++ tests/CMakeLists.txt | 9 + tests/dynamic_bitset/find_next_test.cu | 73 ++++ tests/dynamic_bitset/get_test.cu | 69 +++ tests/dynamic_bitset/rank_test.cu | 56 +++ tests/dynamic_bitset/select_test.cu | 96 +++++ tests/dynamic_bitset/size_test.cu | 33 ++ 9 files changed, 1355 insertions(+) create mode 100644 include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh create mode 100644 include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl create mode 100644 include/cuco/detail/trie/dynamic_bitset/kernels.cuh create mode 100644 tests/dynamic_bitset/find_next_test.cu create mode 100644 tests/dynamic_bitset/get_test.cu create mode 100644 tests/dynamic_bitset/rank_test.cu create mode 100644 tests/dynamic_bitset/select_test.cu create mode 100644 tests/dynamic_bitset/size_test.cu diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh new file mode 100644 index 000000000..8383669fc --- /dev/null +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -0,0 +1,375 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +#include + +#include +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Struct to store ranks of bits at 256-bit intervals (or blocks) + * + * This struct encodes a list of four rank values using base + offset format + * e.g. [1000, 1005, 1006, 1009] is stored as base = 1000, offsets = [5, 6, 9] + * base uses 40 bits, split between one uint32_t and one uint8_t + * each offset uses 8 bits + */ +struct rank { + uint32_t base_hi_; ///< Upper 32 bits of base + uint8_t base_lo_; ///< Lower 8 bits of base + cuda::std::array offsets_; ///< Offsets for 64-bit sub-intervals, relative to base + + /** + * @brief Gets base rank of current 256-bit interval + * + * @return The base rank + */ + __host__ __device__ constexpr uint64_t base() const noexcept + { + return (static_cast(base_hi_) << CHAR_BIT) | base_lo_; + } + + /** + * @brief Sets base rank of current 256-bit interval + * + * @param base Base rank + */ + __host__ __device__ constexpr void set_base(uint64_t base) noexcept + { + base_hi_ = static_cast(base >> CHAR_BIT); + base_lo_ = static_cast(base); + } +}; + +/** + * @brief Bitset class with rank and select index structures + * + * In addition to standard bitset set/test operations, this class provides + * rank and select operation API. It maintains index structures to make both these + * new operations close to constant time. + * + * Current limitations: + * - Stream controls are partially supported due to the use of `thrust::device_vector` as storage + * - Device ref doesn't support modifiers like `set`, `reset`, etc. + * + * @tparam Allocator Type of allocator used for device storage + */ +// TODO: have to use device_malloc_allocator for now otherwise the container cannot grow +template > +class dynamic_bitset { + public: + using size_type = std::size_t; ///< size type to specify bit index + using word_type = uint64_t; ///< word type + /// Type of the allocator to (de)allocate words + using allocator_type = typename std::allocator_traits::rebind_alloc; + + /// Number of bits per block. Note this is a tradeoff between space efficiency and perf. + static constexpr size_type words_per_block = 4; + /// Number of bits in a word + static constexpr size_type bits_per_word = sizeof(word_type) * CHAR_BIT; + /// Number of bits in a block + static constexpr size_type bits_per_block = words_per_block * bits_per_word; + + /** + * @brief Constructs an empty bitset + * + * @param allocator Allocator used for allocating device storage + */ + constexpr dynamic_bitset(Allocator const& allocator = Allocator{}); + + /** + * @brief Appends the given element `value` to the end of the bitset + * + * This API may involve data reallocation if the current storage is exhausted. + * + * @param value Boolean value of the new bit to be added + */ + constexpr void push_back(bool value) noexcept; + + /** + * @brief Sets the target bit indexed by `index` to a specified `value`. + * + * @param index Position of bit to be modified + * @param value New value of the target bit + */ + constexpr void set(size_type index, bool value) noexcept; + + /** + * @brief Sets the last bit to a specified value + * + * @param value New value of the last bit + */ + constexpr void set_last(bool value) noexcept; + + /** + * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the + * boolean value at position `keys_begin[i]` to `output_begin[i]`. + * + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from boolean + * type + * + * @param keys_begin Begin iterator to keys list whose values are queried + * @param keys_end End iterator to keys list + * @param outputs_begin Begin iterator to outputs of test operation + * @param stream Stream to execute test kernel + */ + template + constexpr void test(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) noexcept; + + /** + * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores total + * count of `1` bits preceeding (but not including) position `keys_begin[i]` to `output_begin[i]`. + * + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` + * + * @param keys_begin Begin iterator to keys list whose ranks are queried + * @param keys_end End iterator to keys list + * @param outputs_begin Begin iterator to outputs ranks list + * @param stream Stream to execute ranks kernel + */ + template + constexpr void rank(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) noexcept; + + /** + * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the + * position of `keys_begin[i]`th `1` bit to `output_begin[i]`. + * + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` + * + * @param keys_begin Begin iterator to keys list whose select values are queried + * @param keys_end End iterator to keys list + * @param outputs_begin Begin iterator to outputs selects list + * @param stream Stream to execute selects kernel + */ + template + constexpr void select(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) noexcept; + + using rank_type = cuco::experimental::detail::rank; ///< Rank type + + /** + *@brief Struct to hold all storage refs needed by reference + */ + // TODO: this is not a real ref type, to be changed + struct storage_ref_type { + const word_type* words_ref_; ///< Words ref + + const rank_type* ranks_true_ref_; ///< Ranks ref for 1 bits + const size_type* selects_true_ref_; ///< Selects ref for 1 bits + + const rank_type* ranks_false_ref_; ///< Ranks ref for 0 bits + const size_type* selects_false_ref_; ///< Selects ref 0 bits + }; + + /** + * @brief Device non-owning reference type of dynamic_bitset + */ + class reference { + public: + /** + * @brief Constructs a reference + * + * @param storage Struct with non-owning refs to bitset storage arrays + */ + __host__ __device__ explicit constexpr reference(storage_ref_type storage) noexcept; + + /** + * @brief Access value of a single bit + * + * @param key Position of bit + * + * @return Value of bit at position specified by key + */ + [[nodiscard]] __device__ constexpr bool test(size_type key) const noexcept; + + /** + * @brief Access a single word of internal storage + * + * @param word_id Index of word + * + * @return Word at position specified by index + */ + [[nodiscard]] __device__ constexpr word_type word(size_type word_id) const noexcept; + + /** + * @brief Find position of first set bit starting from a given position (inclusive) + * + * @param key Position of starting bit + * + * @return Index of next set bit + */ + [[nodiscard]] __device__ size_type find_next(size_type key) const noexcept; + + /** + * @brief Find number of set bits (rank) in all positions before the input position (exclusive) + * + * @param key Input bit position + * + * @return Rank of input position + */ + [[nodiscard]] __device__ constexpr size_type rank(size_type key) const noexcept; + + /** + * @brief Find position of Nth set (1) bit counting from start + * + * @param count Input N + * + * @return Position of Nth set bit + */ + [[nodiscard]] __device__ constexpr size_type select(size_type count) const noexcept; + + /** + * @brief Find position of Nth not-set (0) bit counting from start + * + * @param count Input N + * + * @return Position of Nth not-set bit + */ + [[nodiscard]] __device__ constexpr size_type select_false(size_type count) const noexcept; + + private: + /** + * @brief Helper function for select operation that computes an initial rank estimate + * + * @param count Input count for which select operation is being performed + * @param selects Selects array + * @param ranks Ranks array + * + * @return index in ranks which corresponds to highest rank less than count (least upper bound) + */ + template + [[nodiscard]] __device__ constexpr size_type initial_rank_estimate( + size_type count, const SelectsRef& selects, const RanksRef& ranks) const noexcept; + + /** + * @brief Subtract rank estimate from input count and return an increment to word_id + * + * @tparam Rank type + * + * @param count Input count that will be updated + * @param rank Initial rank estimate for count + * + * @return Increment to word_id based on rank values + */ + template + [[nodiscard]] __device__ constexpr size_type subtract_rank_from_count(size_type& count, + Rank rank) const noexcept; + + /** + * @brief Find position of Nth set bit in a 64-bit word + * + * @param N Input count + * + * @return Position of Nth set bit + */ + [[nodiscard]] __device__ size_type select_bit_in_word(size_type N, + word_type word) const noexcept; + + storage_ref_type storage_; ///< Non-owning storage + }; + + using ref_type = reference; ///< Non-owning container ref type + + /** + * @brief Gets non-owning device ref of the current object + * + * @return Device ref of the current `dynamic_bitset` object + */ + [[nodiscard]] constexpr ref_type ref() const noexcept; + + /** + * @brief Gets the number of bits dynamic_bitset holds + * + * @return Number of bits dynamic_bitset holds + */ + [[nodiscard]] constexpr size_type size() const noexcept; + + private: + /// Type of the allocator to (de)allocate ranks + using rank_allocator_type = typename std::allocator_traits::rebind_alloc; + /// Type of the allocator to (de)allocate indices + using size_allocator_type = typename std::allocator_traits::rebind_alloc; + + allocator_type allocator_; ///< Words allocator + size_type n_bits_; ///< Number of bits dynamic_bitset currently holds + bool is_built_; ///< Flag indicating whether the rank and select indices are built or not + + /// Words vector that represents all bits + thrust::device_vector words_; + /// Rank values for every 256-th bit (4-th word) + thrust::device_vector ranks_true_; + /// Same as ranks_ but for `0` bits + thrust::device_vector ranks_false_; + /// Block indices of (0, 256, 512...)th `1` bit + thrust::device_vector selects_true_; + /// Same as selects_, but for `0` bits + thrust::device_vector selects_false_; + + /** + * @brief Builds indexes for rank and select + * + * @param stream Stream to execute kernels + */ + constexpr void build(cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Populates rank and select indexes for true or false bits + * + * @param ranks Output array of ranks + * @param selects Output array of selects + * @param flip_bits If true, negate bits to construct indexes for false bits + * @param stream Stream to execute kernels + */ + constexpr void build_ranks_and_selects( + thrust::device_vector& ranks, + thrust::device_vector& selects, + bool flip_bits, + cuda_stream_ref stream = {}); +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl new file mode 100644 index 000000000..d56ef9d7c --- /dev/null +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -0,0 +1,404 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +#include +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +template +constexpr dynamic_bitset::dynamic_bitset(Allocator const& allocator) + : allocator_{allocator}, + n_bits_{0}, + is_built_{false}, + words_{allocator}, + ranks_true_{allocator}, + ranks_false_{allocator}, + selects_true_{allocator}, + selects_false_{allocator} +{ +} + +template +constexpr void dynamic_bitset::push_back(bool bit) noexcept +{ + if (n_bits_ % bits_per_block == 0) { + words_.resize(words_.size() + words_per_block); // Extend storage by one block + } + + set(n_bits_++, bit); +} + +template +constexpr void dynamic_bitset::set(size_type index, bool bit) noexcept +{ + is_built_ = false; + size_type word_id = index / bits_per_word; + size_type bit_id = index % bits_per_word; + if (bit) { + words_[word_id] |= 1UL << bit_id; + } else { + words_[word_id] &= ~(1UL << bit_id); + } +} + +template +constexpr void dynamic_bitset::set_last(bool bit) noexcept +{ + set(n_bits_ - 1, bit); +} + +template +template +constexpr void dynamic_bitset::test(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) noexcept + +{ + build(); + auto const num_keys = cuco::detail::distance(keys_begin, keys_end); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys); + + bitset_test_kernel<<>>( + ref(), keys_begin, outputs_begin, num_keys); +} + +template +template +constexpr void dynamic_bitset::rank(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) noexcept +{ + build(); + auto const num_keys = cuco::detail::distance(keys_begin, keys_end); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys); + + bitset_rank_kernel<<>>( + ref(), keys_begin, outputs_begin, num_keys); +} + +template +template +constexpr void dynamic_bitset::select(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) noexcept + +{ + build(); + auto const num_keys = cuco::detail::distance(keys_begin, keys_end); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys); + + bitset_select_kernel<<>>( + ref(), keys_begin, outputs_begin, num_keys); +} + +template +constexpr void dynamic_bitset::build_ranks_and_selects( + thrust::device_vector& ranks, + thrust::device_vector& selects, + bool flip_bits, + cuda_stream_ref stream) +{ + if (n_bits_ == 0) { return; } + + // Step 1. Compute prefix sum of per-word bit counts + // Population counts for each word + size_type const num_words = words_.size(); + // Sized to have one extra entry for subsequent prefix sum + auto const bit_counts_size = num_words + 1; + + thrust::device_vector bit_counts(num_words + 1, this->allocator_); + auto const bit_counts_begin = thrust::raw_pointer_cast(bit_counts.data()); + + auto grid_size = cuco::detail::grid_size(num_words); + bit_counts_kernel<<>>( + thrust::raw_pointer_cast(words_.data()), bit_counts_begin, num_words, flip_bits); + + std::size_t temp_storage_bytes = 0; + using temp_allocator_type = typename std::allocator_traits::rebind_alloc; + auto temp_allocator = temp_allocator_type{this->allocator_}; + + CUCO_CUDA_TRY(cub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, bit_counts_begin, bit_counts_begin, bit_counts_size, stream)); + + // Allocate temporary storage + auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes); + + CUCO_CUDA_TRY(cub::DeviceScan::ExclusiveSum(thrust::raw_pointer_cast(d_temp_storage), + temp_storage_bytes, + bit_counts_begin, + bit_counts_begin, + bit_counts_size, + stream)); + + temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); + + // Step 2. Compute ranks + auto const num_blocks = (num_words - 1) / words_per_block + 2; + ranks.resize(num_blocks); + + grid_size = cuco::detail::grid_size(num_blocks); + encode_ranks_from_prefix_bit_counts<<>>( + bit_counts_begin, + thrust::raw_pointer_cast(ranks.data()), + num_words, + num_blocks, + words_per_block); + + // Step 3. Compute selects + thrust::device_vector select_markers(num_blocks, + this->allocator_); + auto const select_markers_begin = thrust::raw_pointer_cast(select_markers.data()); + + mark_blocks_with_select_entries<<>>( + bit_counts_begin, select_markers_begin, num_blocks, words_per_block, bits_per_block); + + auto d_sum = reinterpret_cast(thrust::raw_pointer_cast( + std::allocator_traits::allocate(temp_allocator, sizeof(size_type)))); + CUCO_CUDA_TRY(cub::DeviceReduce::Sum( + nullptr, temp_storage_bytes, select_markers_begin, d_sum, num_blocks, stream)); + + d_temp_storage = temp_allocator.allocate(temp_storage_bytes); + + CUCO_CUDA_TRY(cub::DeviceReduce::Sum(thrust::raw_pointer_cast(d_temp_storage), + temp_storage_bytes, + select_markers_begin, + d_sum, + num_blocks, + stream)); + + size_type num_selects{}; + CUCO_CUDA_TRY( + cudaMemcpyAsync(&num_selects, d_sum, sizeof(size_type), cudaMemcpyDeviceToHost, stream)); + stream.synchronize(); + std::allocator_traits::deallocate( + temp_allocator, thrust::device_ptr{reinterpret_cast(d_sum)}, sizeof(size_type)); + temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); + + selects.resize(num_selects); + + auto const select_begin = thrust::raw_pointer_cast(selects.data()); + + CUCO_CUDA_TRY(cub::DeviceSelect::Flagged(nullptr, + temp_storage_bytes, + thrust::make_counting_iterator(0UL), + select_markers_begin, + select_begin, + thrust::make_discard_iterator(), + num_blocks, + stream)); + + d_temp_storage = temp_allocator.allocate(temp_storage_bytes); + + CUCO_CUDA_TRY(cub::DeviceSelect::Flagged(thrust::raw_pointer_cast(d_temp_storage), + temp_storage_bytes, + thrust::make_counting_iterator(0UL), + select_markers_begin, + select_begin, + thrust::make_discard_iterator(), + num_blocks, + stream)); + + temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); +} + +template +constexpr void dynamic_bitset::build(cuda_stream_ref stream) noexcept +{ + if (not is_built_) { + build_ranks_and_selects(ranks_true_, selects_true_, false, stream); // 1 bits + build_ranks_and_selects(ranks_false_, selects_false_, true, stream); // 0 bits + is_built_ = true; + } +} + +template +constexpr dynamic_bitset::ref_type dynamic_bitset::ref() const noexcept +{ + return ref_type{storage_ref_type{thrust::raw_pointer_cast(words_.data()), + thrust::raw_pointer_cast(ranks_true_.data()), + thrust::raw_pointer_cast(selects_true_.data()), + thrust::raw_pointer_cast(ranks_false_.data()), + thrust::raw_pointer_cast(selects_false_.data())}}; +} + +template +constexpr dynamic_bitset::size_type dynamic_bitset::size() const noexcept +{ + return n_bits_; +} + +// Device reference implementations + +template +__host__ __device__ constexpr dynamic_bitset::reference::reference( + storage_ref_type storage) noexcept + : storage_{storage} +{ +} + +template +__device__ constexpr bool dynamic_bitset::reference::test(size_type key) const noexcept +{ + return (storage_.words_ref_[key / bits_per_word] >> (key % bits_per_word)) & 1UL; +} + +template +__device__ constexpr typename dynamic_bitset::word_type +dynamic_bitset::reference::word(size_type word_id) const noexcept +{ + return storage_.words_ref_[word_id]; +} + +template +__device__ typename dynamic_bitset::size_type +dynamic_bitset::reference::find_next(size_type key) const noexcept +{ + size_type word_id = key / bits_per_word; + size_type bit_id = key % bits_per_word; + word_type word = storage_.words_ref_[word_id]; + word &= ~(0UL) << bit_id; + while (word == 0) { + word = storage_.words_ref_[++word_id]; + } + return word_id * bits_per_word + __ffsll(word) - 1; // cuda intrinsic +} + +template +__device__ constexpr typename dynamic_bitset::size_type +dynamic_bitset::reference::rank(size_type key) const noexcept +{ + size_type word_id = key / bits_per_word; + size_type bit_id = key % bits_per_word; + size_type rank_id = word_id / words_per_block; + size_type offset_id = word_id % words_per_block; + + auto rank = storage_.ranks_true_ref_[rank_id]; + size_type n = rank.base(); + + if (offset_id != 0) { n += rank.offsets_[offset_id - 1]; } + + n += cuda::std::popcount(storage_.words_ref_[word_id] & ((1UL << bit_id) - 1)); + + return n; +} + +template +__device__ constexpr typename dynamic_bitset::size_type +dynamic_bitset::reference::select(size_type count) const noexcept +{ + auto rank_id = initial_rank_estimate(count, storage_.selects_true_ref_, storage_.ranks_true_ref_); + auto rank = storage_.ranks_true_ref_[rank_id]; + + size_type word_id = rank_id * words_per_block; + word_id += subtract_rank_from_count(count, rank); + + return word_id * bits_per_word + select_bit_in_word(count, storage_.words_ref_[word_id]); +} + +template +__device__ constexpr typename dynamic_bitset::size_type +dynamic_bitset::reference::select_false(size_type count) const noexcept +{ + auto rank_id = + initial_rank_estimate(count, storage_.selects_false_ref_, storage_.ranks_false_ref_); + auto rank = storage_.ranks_false_ref_[rank_id]; + + size_type word_id = rank_id * words_per_block; + word_id += subtract_rank_from_count(count, rank); + + return word_id * bits_per_word + select_bit_in_word(count, ~(storage_.words_ref_[word_id])); +} + +template +template +__device__ constexpr typename dynamic_bitset::size_type +dynamic_bitset::reference::initial_rank_estimate(size_type count, + SelectsRef const& selects, + RanksRef const& ranks) const noexcept +{ + size_type block_id = count / (bits_per_word * words_per_block); + size_type begin = selects[block_id]; + size_type end = selects[block_id + 1] + 1UL; + + if (begin + 10 >= end) { // Linear search + while (count >= ranks[begin + 1].base()) { + ++begin; + } + } else { // Binary search + while (begin + 1 < end) { + size_type middle = (begin + end) / 2; + if (count < ranks[middle].base()) { + end = middle; + } else { + begin = middle; + } + } + } + return begin; +} + +template +template +__device__ constexpr typename dynamic_bitset::size_type +dynamic_bitset::reference::subtract_rank_from_count(size_type& count, + Rank rank) const noexcept +{ + count -= rank.base(); + + bool a0 = count >= rank.offsets_[0]; + bool a1 = count >= rank.offsets_[1]; + bool a2 = count >= rank.offsets_[2]; + size_type inc = a0 + a1 + a2; + + count -= (inc > 0) * rank.offsets_[inc - (inc > 0)]; + + return inc; +} + +template +__device__ typename dynamic_bitset::size_type +dynamic_bitset::reference::select_bit_in_word(size_type N, word_type word) const noexcept +{ + for (size_type pos = 0; pos < N; pos++) { + word &= word - 1; + } + return __ffsll(word & -word) - 1; // cuda intrinsic +} +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh new file mode 100644 index 000000000..c92ab60b2 --- /dev/null +++ b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh @@ -0,0 +1,240 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/* + * @brief Test bits for a range of keys + * + * @tparam BitsetRef Bitset reference type + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from boolean + * type + * + * @param ref Bitset ref + * @param keys Begin iterator to keys + * @param outputs Begin iterator to outputs + * @param num_keys Number of input keys + */ +template +__global__ void bitset_test_kernel(BitsetRef ref, + KeyIt keys, + OutputIt outputs, + cuco::detail::index_type num_keys) +{ + auto key_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (key_id < num_keys) { + outputs[key_id] = ref.test(keys[key_id]); + key_id += stride; + } +} + +/* + * @brief Gather rank values for a range of keys + * + * @tparam BitsetRef Bitset reference type + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` + * + * @param ref Bitset ref + * @param keys Begin iterator to keys + * @param outputs Begin iterator to outputs + * @param num_keys Number of input keys + */ +template +__global__ void bitset_rank_kernel(BitsetRef ref, + KeyIt keys, + OutputIt outputs, + cuco::detail::index_type num_keys) +{ + auto key_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (key_id < num_keys) { + outputs[key_id] = ref.rank(keys[key_id]); + key_id += stride; + } +} + +/* + * @brief Gather select values for a range of keys + * + * @tparam BitsetRef Bitset reference type + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` + * + * @param ref Bitset ref + * @param keys Begin iterator to keys + * @param outputs Begin iterator to outputs + * @param num_keys Number of input keys + */ +template +__global__ void bitset_select_kernel(BitsetRef ref, + KeyIt keys, + OutputIt outputs, + cuco::detail::index_type num_keys) +{ + auto key_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (key_id < num_keys) { + outputs[key_id] = ref.select(keys[key_id]); + key_id += stride; + } +} + +/* + * @brief Computes number of set or not-set bits in each word + * + * @tparam WordType Word type + * @tparam SizeType Size type + * + * @param words Input array of words + * @param bit_counts Output array of per-word bit counts + * @param num_words Number of words + * @param flip_bits Boolean to request negation of words before counting bits + */ +template +__global__ void bit_counts_kernel(WordType const* words, + SizeType* bit_counts, + cuco::detail::index_type num_words, + bool flip_bits) +{ + auto word_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (word_id < num_words) { + auto word = words[word_id]; + bit_counts[word_id] = cuda::std::popcount(flip_bits ? ~word : word); + word_id += stride; + } +} + +/* + * @brief Compute rank values at block size intervals. + * + * ranks[i] = Number of set bits in [0, i) range + * This kernel transforms prefix sum array of per-word bit counts + * into base-delta encoding style of `rank` struct. + * Since prefix sum is available, there are no dependencies across blocks. + + * @tparam SizeType Size type + * + * @param prefix_bit_counts Prefix sum array of per-word bit counts + * @param ranks Output array of ranks + * @param num_words Length of input array + * @param num_blocks Length of ouput array + * @param words_per_block Number of words in each block + */ +template +__global__ void encode_ranks_from_prefix_bit_counts(const SizeType* prefix_bit_counts, + rank* ranks, + SizeType num_words, + SizeType num_blocks, + SizeType words_per_block) +{ + auto rank_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (rank_id < num_blocks) { + SizeType word_id = rank_id * words_per_block; + + // Set base value of rank + auto& rank = ranks[rank_id]; + rank.set_base(prefix_bit_counts[word_id]); + + if (rank_id < num_blocks - 1) { + // For each subsequent word in this block, compute deltas from base + for (SizeType block_offset = 0; block_offset < words_per_block - 1; block_offset++) { + auto delta = prefix_bit_counts[word_id + block_offset + 1] - prefix_bit_counts[word_id]; + rank.offsets_[block_offset] = delta; + } + } + rank_id += stride; + } +} + +/* + * @brief Compute select values at block size intervals. + * + * selects[i] = Position of (i+ 1)th set bit + * This kernel check for blocks where prefix sum crosses a multiple of `bits_per_block`. + * Such blocks are marked in the output boolean array + * + * @tparam SizeType Size type + * + * @param prefix_bit_counts Prefix sum array of per-word bit counts + * @param selects_markers Ouput array indicating whether a block has selects entry or not + * @param num_blocks Length of ouput array + * @param words_per_block Number of words in each block + * @param bits_per_block Number of bits in each block + */ +template +__global__ void mark_blocks_with_select_entries(SizeType const* prefix_bit_counts, + SizeType* select_markers, + SizeType num_blocks, + SizeType words_per_block, + SizeType bits_per_block) +{ + auto block_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (block_id < num_blocks) { + if (block_id == 0) { // Block 0 always has a selects entry + select_markers[block_id] = 1; + block_id += stride; + continue; + } + + select_markers[block_id] = 0; // Always clear marker first + SizeType word_id = block_id * words_per_block; + SizeType prev_count = prefix_bit_counts[word_id]; + + for (size_t block_offset = 1; block_offset <= words_per_block; block_offset++) { + SizeType count = prefix_bit_counts[word_id + block_offset]; + + // Selects entry is added when cumulative bitcount crosses a multiple of bits_per_block + if ((prev_count - 1) / bits_per_block != (count - 1) / bits_per_block) { + select_markers[block_id] = 1; + break; + } + prev_count = count; + } + + block_id += stride; + } +} + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d78ec7f49..3deeeddf1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -96,3 +96,12 @@ ConfigureTest(STATIC_MULTIMAP_TEST static_multimap/multiplicity_test.cu static_multimap/non_match_test.cu static_multimap/pair_function_test.cu) + +################################################################################################### +# - dynamic_bitset tests -------------------------------------------------------------------------- +ConfigureTest(DYNAMIC_BITSET_TEST + dynamic_bitset/find_next_test.cu + dynamic_bitset/get_test.cu + dynamic_bitset/rank_test.cu + dynamic_bitset/select_test.cu + dynamic_bitset/size_test.cu) diff --git a/tests/dynamic_bitset/find_next_test.cu b/tests/dynamic_bitset/find_next_test.cu new file mode 100644 index 000000000..97ba366ea --- /dev/null +++ b/tests/dynamic_bitset/find_next_test.cu @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include + +#include + +template +__global__ void find_next_kernel(BitsetRef ref, size_type num_elements, OutputIt output) +{ + cuco::detail::index_type index = blockIdx.x * blockDim.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; + while (index < num_elements) { + output[index] = ref.find_next(index); + index += stride; + } +} + +extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu + +TEST_CASE("Find next set test", "") +{ + cuco::experimental::detail::dynamic_bitset bv; + + using size_type = std::size_t; + constexpr size_type num_elements{400}; + + for (size_type i = 0; i < num_elements; i++) { + bv.push_back(modulo_bitgen(i)); + } + + thrust::device_vector device_result(num_elements); + auto ref = bv.ref(); + find_next_kernel<<<1, 1024>>>(ref, num_elements, device_result.data()); + + thrust::host_vector host_result = device_result; + size_type num_matches = 0; + + size_type next_set_pos = -1lu; + do { + next_set_pos++; + } while (next_set_pos < num_elements and !modulo_bitgen(next_set_pos)); + + for (size_type key = 0; key < num_elements; key++) { + num_matches += host_result[key] == next_set_pos; + + if (key == next_set_pos) { + do { + next_set_pos++; + } while (next_set_pos < num_elements and !modulo_bitgen(next_set_pos)); + } + } + REQUIRE(num_matches == num_elements); +} diff --git a/tests/dynamic_bitset/get_test.cu b/tests/dynamic_bitset/get_test.cu new file mode 100644 index 000000000..10f81a116 --- /dev/null +++ b/tests/dynamic_bitset/get_test.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +template +__global__ void test_kernel(BitsetRef ref, size_type num_elements, OutputIt output) +{ + cuco::detail::index_type index = blockIdx.x * blockDim.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; + while (index < num_elements) { + output[index] = ref.test(index); + index += stride; + } +} + +bool modulo_bitgen(uint64_t i) { return i % 7 == 0; } + +TEST_CASE("Get test", "") +{ + cuco::experimental::detail::dynamic_bitset bv; + + using size_type = std::size_t; + constexpr size_type num_elements{400}; + + size_type num_set_ref = 0; + for (size_type i = 0; i < num_elements; i++) { + bv.push_back(modulo_bitgen(i)); + num_set_ref += modulo_bitgen(i); + } + + // Host-bulk test + thrust::device_vector keys(num_elements); + thrust::sequence(keys.begin(), keys.end(), 0); + + thrust::device_vector test_result(num_elements); + thrust::fill(test_result.begin(), test_result.end(), 0); + + bv.test(keys.begin(), keys.end(), test_result.begin()); + + size_type num_set = thrust::reduce(thrust::device, test_result.begin(), test_result.end(), 0); + REQUIRE(num_set == num_set_ref); + + // Device-ref test + auto ref = bv.ref(); + thrust::fill(test_result.begin(), test_result.end(), 0); + test_kernel<<<1, 1024>>>(ref, num_elements, test_result.data()); + + num_set = thrust::reduce(thrust::device, test_result.begin(), test_result.end(), 0); + REQUIRE(num_set == num_set_ref); +} diff --git a/tests/dynamic_bitset/rank_test.cu b/tests/dynamic_bitset/rank_test.cu new file mode 100644 index 000000000..3b4d17cca --- /dev/null +++ b/tests/dynamic_bitset/rank_test.cu @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include + +#include + +extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu + +TEST_CASE("Rank test", "") +{ + cuco::experimental::detail::dynamic_bitset bv; + + using size_type = std::size_t; + constexpr size_type num_elements{4000}; + + for (size_type i = 0; i < num_elements; i++) { + bv.push_back(modulo_bitgen(i)); + } + + thrust::device_vector keys(num_elements); + thrust::sequence(keys.begin(), keys.end(), 0); + + thrust::device_vector d_ranks(num_elements); + + bv.rank(keys.begin(), keys.end(), d_ranks.begin()); + + thrust::host_vector h_ranks = d_ranks; + + size_type cur_rank = 0; + size_type num_matches = 0; + for (size_type i = 0; i < num_elements; i++) { + num_matches += cur_rank == h_ranks[i]; + if (modulo_bitgen(i)) { cur_rank++; } + } + REQUIRE(num_matches == num_elements); +} diff --git a/tests/dynamic_bitset/select_test.cu b/tests/dynamic_bitset/select_test.cu new file mode 100644 index 000000000..3dc0d74da --- /dev/null +++ b/tests/dynamic_bitset/select_test.cu @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include + +#include + +template +__global__ void select_false_kernel(BitsetRef ref, size_type num_elements, OutputIt output) +{ + cuco::detail::index_type index = blockIdx.x * blockDim.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; + while (index < num_elements) { + output[index] = ref.select_false(index); + index += stride; + } +} + +extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu + +TEST_CASE("Select test", "") +{ + cuco::experimental::detail::dynamic_bitset bv; + + using size_type = std::size_t; + constexpr size_type num_elements{4000}; + + size_type num_set = 0; + for (size_type i = 0; i < num_elements; i++) { + bv.push_back(modulo_bitgen(i)); + num_set += modulo_bitgen(i); + } + + // Check select + { + thrust::device_vector keys(num_set); + thrust::sequence(keys.begin(), keys.end(), 0); + + thrust::device_vector d_selects(num_set); + + bv.select(keys.begin(), keys.end(), d_selects.begin()); + + thrust::host_vector h_selects = d_selects; + + size_type num_matches = 0; + size_type cur_set_pos = -1lu; + for (size_type i = 0; i < num_set; i++) { + do { + cur_set_pos++; + } while (cur_set_pos < num_elements and !modulo_bitgen(cur_set_pos)); + + num_matches += cur_set_pos == h_selects[i]; + } + REQUIRE(num_matches == num_set); + } + + // Check select_false + { + size_type num_not_set = num_elements - num_set; + + auto ref = bv.ref(); + thrust::device_vector device_result(num_not_set); + select_false_kernel<<<1, 1024>>>(ref, num_not_set, device_result.data()); + thrust::host_vector host_result = device_result; + + size_type num_matches = 0; + size_type cur_not_set_pos = -1lu; + for (size_type i = 0; i < num_not_set; i++) { + do { + cur_not_set_pos++; + } while (cur_not_set_pos < num_elements and modulo_bitgen(cur_not_set_pos)); + + num_matches += cur_not_set_pos == host_result[i]; + } + REQUIRE(num_matches == num_not_set); + } +} diff --git a/tests/dynamic_bitset/size_test.cu b/tests/dynamic_bitset/size_test.cu new file mode 100644 index 000000000..611159dc3 --- /dev/null +++ b/tests/dynamic_bitset/size_test.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +TEST_CASE("Size computation", "") +{ + cuco::experimental::detail::dynamic_bitset bv; + using size_type = std::size_t; + constexpr size_type num_elements{400}; + + for (size_type i = 0; i < num_elements; i++) { + bv.push_back(i % 2 == 0); // Alternate 0s and 1s pattern + } + + auto size = bv.size(); + REQUIRE(size == num_elements); +}